1. Setup & Imports¶

In [1]:
!pip install pandas matplotlib seaborn scipy openpyxl
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: pandas in c:\users\dell\appdata\roaming\python\python313\site-packages (2.3.2)
Requirement already satisfied: matplotlib in c:\users\dell\appdata\roaming\python\python313\site-packages (3.10.5)
Requirement already satisfied: seaborn in c:\users\dell\appdata\roaming\python\python313\site-packages (0.13.2)
Requirement already satisfied: scipy in c:\users\dell\appdata\roaming\python\python313\site-packages (1.16.1)
Requirement already satisfied: openpyxl in c:\users\dell\appdata\roaming\python\python313\site-packages (3.1.5)
Requirement already satisfied: numpy>=1.26.0 in c:\users\dell\appdata\roaming\python\python313\site-packages (from pandas) (2.3.2)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\dell\appdata\roaming\python\python313\site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\dell\appdata\roaming\python\python313\site-packages (from pandas) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in c:\users\dell\appdata\roaming\python\python313\site-packages (from pandas) (2025.2)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\dell\appdata\roaming\python\python313\site-packages (from matplotlib) (1.3.3)
Requirement already satisfied: cycler>=0.10 in c:\users\dell\appdata\roaming\python\python313\site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\dell\appdata\roaming\python\python313\site-packages (from matplotlib) (4.59.1)
Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\dell\appdata\roaming\python\python313\site-packages (from matplotlib) (1.4.9)
Requirement already satisfied: packaging>=20.0 in c:\users\dell\appdata\roaming\python\python313\site-packages (from matplotlib) (25.0)
Requirement already satisfied: pillow>=8 in c:\users\dell\appdata\roaming\python\python313\site-packages (from matplotlib) (11.2.1)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\dell\appdata\roaming\python\python313\site-packages (from matplotlib) (3.2.3)
Requirement already satisfied: et-xmlfile in c:\users\dell\appdata\roaming\python\python313\site-packages (from openpyxl) (2.0.0)
Requirement already satisfied: six>=1.5 in c:\users\dell\appdata\roaming\python\python313\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)
[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
In [2]:
# Core packages
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Stats
from scipy import stats

# Settings
pd.set_option("display.max_columns", None)
sns.set_style("whitegrid")

2. Load Data¶

purchase behaviour¶

In [3]:
# Load purchase behaviour
purchase_behaviour = pd.read_csv("QVI_purchase_behaviour.csv")

# Preview
print(f"Purchase Behaviour Shape is: {purchase_behaviour.shape}")
display(purchase_behaviour.head())
Purchase Behaviour Shape is: (72637, 3)
LYLTY_CARD_NBR LIFESTAGE PREMIUM_CUSTOMER
0 1000 YOUNG SINGLES/COUPLES Premium
1 1002 YOUNG SINGLES/COUPLES Mainstream
2 1003 YOUNG FAMILIES Budget
3 1004 OLDER SINGLES/COUPLES Mainstream
4 1005 MIDAGE SINGLES/COUPLES Mainstream

transaction data¶

In [4]:
# Load transaction data
transaction_data = pd.read_excel("QVI_transaction_data.xlsx")

# Preview
print(f"Transaction Data Shape is: {transaction_data.shape}")
display(transaction_data.head())
Transaction Data Shape is: (264836, 8)
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES
0 43390 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0
1 43599 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3
2 43605 1 1343 383 61 Smiths Crinkle Cut Chips Chicken 170g 2 2.9
3 43329 2 2373 974 69 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0
4 43330 2 2426 1038 108 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8
In [ ]:
 

3. Data Understanding¶

  • purchase_behaviour: contains LYLTY_CARD_NBR, LIFESTAGE, PREMIUM_CUSTOMER

  • transaction_data: contains LYLTY_CARD_NBR, DATE, PROD_NAME, PROD_QTY, TOT_SALES, STORE_NBR

In [5]:
display("Data Shape:", purchase_behaviour.shape)
display("\nData Types:\n", purchase_behaviour.dtypes)
display("\nMissing Values:\n", purchase_behaviour.isnull().sum())
display("\nUnique Values per Column:\n", purchase_behaviour.nunique())
display("\nStatistical Summary:\n", purchase_behaviour.describe(include='all'))
'Data Shape:'
(72637, 3)
'\nData Types:\n'
LYLTY_CARD_NBR       int64
LIFESTAGE           object
PREMIUM_CUSTOMER    object
dtype: object
'\nMissing Values:\n'
LYLTY_CARD_NBR      0
LIFESTAGE           0
PREMIUM_CUSTOMER    0
dtype: int64
'\nUnique Values per Column:\n'
LYLTY_CARD_NBR      72637
LIFESTAGE               7
PREMIUM_CUSTOMER        3
dtype: int64
'\nStatistical Summary:\n'
LYLTY_CARD_NBR LIFESTAGE PREMIUM_CUSTOMER
count 7.263700e+04 72637 72637
unique NaN 7 3
top NaN RETIREES Mainstream
freq NaN 14805 29245
mean 1.361859e+05 NaN NaN
std 8.989293e+04 NaN NaN
min 1.000000e+03 NaN NaN
25% 6.620200e+04 NaN NaN
50% 1.340400e+05 NaN NaN
75% 2.033750e+05 NaN NaN
max 2.373711e+06 NaN NaN
In [6]:
display("Data Shape:", transaction_data.shape)
display("\nData Types:\n", transaction_data.dtypes)
display("\nMissing Values:\n", transaction_data.isnull().sum())
display("\nUnique Values per Column:\n", transaction_data.nunique())
display("\nStatistical Summary:\n", transaction_data.describe(include='all'))
'Data Shape:'
(264836, 8)
'\nData Types:\n'
DATE                int64
STORE_NBR           int64
LYLTY_CARD_NBR      int64
TXN_ID              int64
PROD_NBR            int64
PROD_NAME          object
PROD_QTY            int64
TOT_SALES         float64
dtype: object
'\nMissing Values:\n'
DATE              0
STORE_NBR         0
LYLTY_CARD_NBR    0
TXN_ID            0
PROD_NBR          0
PROD_NAME         0
PROD_QTY          0
TOT_SALES         0
dtype: int64
'\nUnique Values per Column:\n'
DATE                 364
STORE_NBR            272
LYLTY_CARD_NBR     72637
TXN_ID            263127
PROD_NBR             114
PROD_NAME            114
PROD_QTY               6
TOT_SALES            112
dtype: int64
'\nStatistical Summary:\n'
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES
count 264836.000000 264836.00000 2.648360e+05 2.648360e+05 264836.000000 264836 264836.000000 264836.000000
unique NaN NaN NaN NaN NaN 114 NaN NaN
top NaN NaN NaN NaN NaN Kettle Mozzarella Basil & Pesto 175g NaN NaN
freq NaN NaN NaN NaN NaN 3304 NaN NaN
mean 43464.036260 135.08011 1.355495e+05 1.351583e+05 56.583157 NaN 1.907309 7.304200
std 105.389282 76.78418 8.057998e+04 7.813303e+04 32.826638 NaN 0.643654 3.083226
min 43282.000000 1.00000 1.000000e+03 1.000000e+00 1.000000 NaN 1.000000 1.500000
25% 43373.000000 70.00000 7.002100e+04 6.760150e+04 28.000000 NaN 2.000000 5.400000
50% 43464.000000 130.00000 1.303575e+05 1.351375e+05 56.000000 NaN 2.000000 7.400000
75% 43555.000000 203.00000 2.030942e+05 2.027012e+05 85.000000 NaN 2.000000 9.200000
max 43646.000000 272.00000 2.373711e+06 2.415841e+06 114.000000 NaN 200.000000 650.000000
In [ ]:
 
In [7]:
print(purchase_behaviour.duplicated().sum())
print(transaction_data.duplicated().sum())
0
1
In [8]:
transaction_data.drop_duplicates(inplace=True)
In [9]:
print(purchase_behaviour.duplicated().sum())
print(transaction_data.duplicated().sum())
0
0
In [10]:
print("Purchase Behaviour")
purchase_behaviour.info()

purchase_behaviour.describe(include="all")
Purchase Behaviour
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72637 entries, 0 to 72636
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   LYLTY_CARD_NBR    72637 non-null  int64 
 1   LIFESTAGE         72637 non-null  object
 2   PREMIUM_CUSTOMER  72637 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.7+ MB
Out[10]:
LYLTY_CARD_NBR LIFESTAGE PREMIUM_CUSTOMER
count 7.263700e+04 72637 72637
unique NaN 7 3
top NaN RETIREES Mainstream
freq NaN 14805 29245
mean 1.361859e+05 NaN NaN
std 8.989293e+04 NaN NaN
min 1.000000e+03 NaN NaN
25% 6.620200e+04 NaN NaN
50% 1.340400e+05 NaN NaN
75% 2.033750e+05 NaN NaN
max 2.373711e+06 NaN NaN
In [11]:
print("Transaction Data")
transaction_data.info()

transaction_data.describe()
Transaction Data
<class 'pandas.core.frame.DataFrame'>
Index: 264835 entries, 0 to 264835
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   DATE            264835 non-null  int64  
 1   STORE_NBR       264835 non-null  int64  
 2   LYLTY_CARD_NBR  264835 non-null  int64  
 3   TXN_ID          264835 non-null  int64  
 4   PROD_NBR        264835 non-null  int64  
 5   PROD_NAME       264835 non-null  object 
 6   PROD_QTY        264835 non-null  int64  
 7   TOT_SALES       264835 non-null  float64
dtypes: float64(1), int64(6), object(1)
memory usage: 18.2+ MB
Out[11]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_QTY TOT_SALES
count 264835.000000 264835.000000 2.648350e+05 2.648350e+05 264835.000000 264835.000000 264835.000000
mean 43464.036600 135.080216 1.355496e+05 1.351584e+05 56.583201 1.907308 7.304205
std 105.389336 76.784306 8.058011e+04 7.813316e+04 32.826692 0.643655 3.083231
min 43282.000000 1.000000 1.000000e+03 1.000000e+00 1.000000 1.000000 1.500000
25% 43373.000000 70.000000 7.002100e+04 6.760100e+04 28.000000 2.000000 5.400000
50% 43464.000000 130.000000 1.303580e+05 1.351380e+05 56.000000 2.000000 7.400000
75% 43555.000000 203.000000 2.030945e+05 2.027015e+05 85.000000 2.000000 9.200000
max 43646.000000 272.000000 2.373711e+06 2.415841e+06 114.000000 200.000000 650.000000
In [ ]:
 

4. Data Cleaning¶

  • Fix date column format

  • Check for missing values

  • Detect & remove outliers (e.g., extremely high PROD_QTY)

  • Create new features:

    • Pack_Size

    • Brand

In [12]:
# Convert DATE to datetime
transaction_data["DATE"] = pd.to_datetime(transaction_data["DATE"], unit='D', origin='1899-12-30')
transaction_data.info()
display(transaction_data.head())
<class 'pandas.core.frame.DataFrame'>
Index: 264835 entries, 0 to 264835
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   DATE            264835 non-null  datetime64[ns]
 1   STORE_NBR       264835 non-null  int64         
 2   LYLTY_CARD_NBR  264835 non-null  int64         
 3   TXN_ID          264835 non-null  int64         
 4   PROD_NBR        264835 non-null  int64         
 5   PROD_NAME       264835 non-null  object        
 6   PROD_QTY        264835 non-null  int64         
 7   TOT_SALES       264835 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(5), object(1)
memory usage: 18.2+ MB
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3
2 2019-05-20 1 1343 383 61 Smiths Crinkle Cut Chips Chicken 170g 2 2.9
3 2018-08-17 2 2373 974 69 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0
4 2018-08-18 2 2426 1038 108 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8
In [13]:
# Missing values
print("Missing Values for Purchase Behaviour:")
print(purchase_behaviour.isnull().sum())

print("\nMissing Values for Transaction Data:")
print(transaction_data.isnull().sum())
Missing Values for Purchase Behaviour:
LYLTY_CARD_NBR      0
LIFESTAGE           0
PREMIUM_CUSTOMER    0
dtype: int64

Missing Values for Transaction Data:
DATE              0
STORE_NBR         0
LYLTY_CARD_NBR    0
TXN_ID            0
PROD_NBR          0
PROD_NAME         0
PROD_QTY          0
TOT_SALES         0
dtype: int64
In [14]:
# -----------------------------
# Outlier Detection & Removal
# -----------------------------

import matplotlib.pyplot as plt
import seaborn as sns

# --- Before Cleaning ---
print("Before Cleaning:")
display(transaction_data[["PROD_QTY", "TOT_SALES"]].describe())

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.boxplot(y=transaction_data["PROD_QTY"])
plt.title("PROD_QTY (Before Cleaning)")

plt.subplot(1, 2, 2)
sns.boxplot(y=transaction_data["TOT_SALES"])
plt.title("TOT_SALES (Before Cleaning)")
plt.show()
Before Cleaning:
PROD_QTY TOT_SALES
count 264835.000000 264835.000000
mean 1.907308 7.304205
std 0.643655 3.083231
min 1.000000 1.500000
25% 2.000000 5.400000
50% 2.000000 7.400000
75% 2.000000 9.200000
max 200.000000 650.000000
No description has been provided for this image
In [15]:
# -----------------------------
# 2. Transaction Insights
# -----------------------------
# Distribution of total sales
transaction_data["TOT_SALES"] = transaction_data["TOT_SALES"].astype(float)
plt.figure(figsize=(8,5))
sns.histplot(transaction_data['TOT_SALES'], bins=50, kde=True, color="royalblue")
plt.title("Distribution of Total Sales")
plt.show()


# Aggregate sales & quantities by product quantity ranges
qty_stats = transaction_data.groupby("PROD_QTY").agg(
    Total_Sales=("TOT_SALES", "sum"),
    Total_Qty=("PROD_QTY", "count")   # count of transactions with that quantity
)

# Plot chart
plt.figure(figsize=(12,6))
ax = qty_stats["Total_Sales"].plot(kind="bar", color="blue", rot=0)

plt.title("Sales & Transactions by Product Quantity")
plt.xlabel("Product Quantity")
plt.ylabel("Total Sales ($)")

# Add labels with sales and quantity
for i, (sales, qty) in enumerate(zip(qty_stats["Total_Sales"], qty_stats["Total_Qty"])):
    ax.text(
        i, sales + (0.01 * qty_stats["Total_Sales"].max()),
        f"$ {sales:,.0f}\nQty: {qty:,}",   # $ for sales + Qty count
        ha='center', va='bottom', fontsize=9, fontweight='bold'
    )

plt.show()

# Average chips per transaction
avg_qty = transaction_data.groupby("LYLTY_CARD_NBR")["PROD_QTY"].mean().mean()
print(f"Average Chips per Transaction: {avg_qty:.2f}")
No description has been provided for this image
No description has been provided for this image
Average Chips per Transaction: 1.82
In [16]:
transaction_data[transaction_data["PROD_QTY"] == 200]
Out[16]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES
69762 2018-08-19 226 226000 226201 4 Dorito Corn Chp Supreme 380g 200 650.0
69763 2019-05-20 226 226000 226210 4 Dorito Corn Chp Supreme 380g 200 650.0
In [17]:
transaction_data[transaction_data["TOT_SALES"] == 650]
Out[17]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES
69762 2018-08-19 226 226000 226201 4 Dorito Corn Chp Supreme 380g 200 650.0
69763 2019-05-20 226 226000 226210 4 Dorito Corn Chp Supreme 380g 200 650.0
In [18]:
transaction_data[transaction_data["LYLTY_CARD_NBR"] == 226000]
Out[18]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES
69762 2018-08-19 226 226000 226201 4 Dorito Corn Chp Supreme 380g 200 650.0
69763 2019-05-20 226 226000 226210 4 Dorito Corn Chp Supreme 380g 200 650.0
In [19]:
# Filter suspicious customer
important_customer = transaction_data[(transaction_data["STORE_NBR"] == 226) & (transaction_data["LYLTY_CARD_NBR"] == 226000)]

# Save to CSV
important_customer.to_csv("Saved_Data/important_customer.csv", index=False)

print("Important customer data saved successfully!")
Important customer data saved successfully!
In [20]:
# --- Outlier Removal Rules ---
# Most transactions are for 1–5 packs, so let's filter extreme values
cleaned_data = transaction_data[(transaction_data["PROD_QTY"] < 200) & 
                                (transaction_data["TOT_SALES"] < 650)]
In [21]:
# --- After Cleaning ---
print("\nAfter Cleaning:")
print(cleaned_data[["PROD_QTY", "TOT_SALES"]].describe())

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.boxplot(y=cleaned_data["PROD_QTY"])
plt.title("PROD_QTY (After Cleaning)")

plt.subplot(1, 2, 2)
sns.boxplot(y=cleaned_data["TOT_SALES"])
plt.title("TOT_SALES (After Cleaning)")
plt.show()

# Replace old transaction_data with cleaned version for next steps
transaction_data = cleaned_data.copy()
After Cleaning:
            PROD_QTY      TOT_SALES
count  264833.000000  264833.000000
mean        1.905812       7.299351
std         0.343437       2.527244
min         1.000000       1.500000
25%         2.000000       5.400000
50%         2.000000       7.400000
75%         2.000000       9.200000
max         5.000000      29.500000
No description has been provided for this image
In [22]:
# -----------------------------
# 2. Transaction Insights
# -----------------------------
# Distribution of total sales
transaction_data["TOT_SALES"] = transaction_data["TOT_SALES"].astype(float)
plt.figure(figsize=(8,5))
sns.histplot(transaction_data['TOT_SALES'], bins=50, kde=True, color="royalblue")
plt.title("Distribution of Total Sales")
plt.show()


# Aggregate sales, transactions, and customers by product quantity
qty_stats = transaction_data.groupby("PROD_QTY").agg(
    Total_Sales=("TOT_SALES", "sum"),
    Total_Qty=("PROD_QTY", "count"),               # number of transactions
    Total_Customers=("LYLTY_CARD_NBR", "nunique")  # number of unique customers
)

# Plot chart
plt.figure(figsize=(12,6))
ax = qty_stats["Total_Sales"].plot(kind="bar", color="blue", rot=0)

plt.title("Sales, Transactions & Customers by Product Quantity", pad=20)
plt.xlabel("Product Quantity")
plt.ylabel("Total Sales ($)")

# Add labels with sales, transactions, and customers
for i, (sales, qty, cust) in enumerate(zip(qty_stats["Total_Sales"], qty_stats["Total_Qty"], qty_stats["Total_Customers"])):
    ax.text(
        i, sales + (0.01 * qty_stats["Total_Sales"].max()),
        f"$ {sales:,.0f}\nQty: {qty:,}\nCust: {cust:,}",   # show sales, transactions & customers
        ha='center', va='bottom', fontsize=9, fontweight='bold'
    )

plt.show()

# Average chips per transaction
avg_qty = transaction_data.groupby("LYLTY_CARD_NBR")["PROD_QTY"].mean().mean()
print(f"Average Chips per Transaction: {avg_qty:.2f}")

# Total number of unique customers overall
total_customers = transaction_data["LYLTY_CARD_NBR"].nunique()
print(f"Total Unique Customers: {total_customers:,}")
No description has been provided for this image
No description has been provided for this image
Average Chips per Transaction: 1.82
Total Unique Customers: 72,636
In [23]:
# calculate valus counts
print("Value Counts for 'PROD_QTY':")
transaction_data["PROD_QTY"].value_counts().sort_index().sort_values(ascending=False)
Value Counts for 'PROD_QTY':
Out[23]:
PROD_QTY
2    236038
1     27518
5       450
3       430
4       397
Name: count, dtype: int64
In [ ]:
 
In [24]:
print("\nValue Counts for 'TOT_SALES':")
transaction_data["TOT_SALES"].value_counts().sort_index().sort_values(ascending=False)
Value Counts for 'TOT_SALES':
Out[24]:
TOT_SALES
9.2     22821
7.4     22513
6.0     20797
7.6     20212
8.8     19900
        ...  
6.9         3
9.3         3
15.5        3
11.2        2
12.4        2
Name: count, Length: 111, dtype: int64
In [25]:
transaction_data.describe()
Out[25]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_QTY TOT_SALES
count 264833 264833.000000 2.648330e+05 2.648330e+05 264833.000000 264833.000000 264833.000000
mean 2018-12-30 00:52:39.666657792 135.079529 1.355489e+05 1.351577e+05 56.583598 1.905812 7.299351
min 2018-07-01 00:00:00 1.000000 1.000000e+03 1.000000e+00 1.000000 1.000000 1.500000
25% 2018-09-30 00:00:00 70.000000 7.002100e+04 6.760000e+04 28.000000 2.000000 5.400000
50% 2018-12-30 00:00:00 130.000000 1.303570e+05 1.351370e+05 56.000000 2.000000 7.400000
75% 2019-03-31 00:00:00 203.000000 2.030940e+05 2.027000e+05 85.000000 2.000000 9.200000
max 2019-06-30 00:00:00 272.000000 2.373711e+06 2.415841e+06 114.000000 5.000000 29.500000
std NaN 76.784189 8.058003e+04 7.813305e+04 32.826498 0.343437 2.527244
In [26]:
# Count transactions by date to see if there are any missing days 
count = transaction_data.groupby(transaction_data['DATE'].dt.date).size().reset_index(name = 'COUNT')
count.shape
Out[26]:
(364, 2)
In [27]:
# There is one day of data missing. First check the range of dates by sorting in time order. 
transaction_data.sort_values(by='DATE')
Out[27]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES
205333 2018-07-01 24 24109 20881 54 CCs Original 175g 2 4.2
202059 2018-07-01 236 236023 238660 100 Smiths Crinkle Cut Chips Chs&Onion170g 2 5.8
102495 2018-07-01 45 45100 40977 47 Doritos Corn Chips Original 170g 2 8.8
217968 2018-07-01 21 21284 17968 59 Old El Paso Salsa Dip Tomato Med 300g 2 10.2
149892 2018-07-01 262 262188 262373 114 Kettle Sensations Siracha Lime 150g 2 9.2
... ... ... ... ... ... ... ... ...
202851 2019-06-30 249 249313 251226 40 Thins Chips Seasonedchicken 175g 2 6.6
242865 2019-06-30 58 58097 53258 16 Smiths Crinkle Chips Salt & Vinegar 330g 2 11.4
139326 2019-06-30 66 66024 63265 99 Pringles Sthrn FriedChicken 134g 2 7.4
16872 2019-06-30 271 271203 269422 22 Thins Chips Originl saltd 175g 2 6.6
85334 2019-06-30 63 63204 60732 92 WW Crinkle Cut Chicken 175g 2 3.4

264833 rows × 8 columns

In [28]:
# Generate a list of dates with transactions in ascending order 
date_counts = transaction_data.groupby('DATE').size()

# Then compare to a full list of dates within the same range to find differences between them 
pd.date_range(start = '2018-07-01', end = '2019-06-30' ).difference(date_counts.index)
Out[28]:
DatetimeIndex(['2018-12-25'], dtype='datetime64[ns]', freq='D')
In [29]:
len(transaction_data[transaction_data['DATE'] == '2018-12-25'])
Out[29]:
0
In [30]:
unique_products = transaction_data['PROD_NAME'].unique()
print(unique_products)
['Natural Chip        Compny SeaSalt175g' 'CCs Nacho Cheese    175g'
 'Smiths Crinkle Cut  Chips Chicken 170g'
 'Smiths Chip Thinly  S/Cream&Onion 175g'
 'Kettle Tortilla ChpsHny&Jlpno Chili 150g'
 'Old El Paso Salsa   Dip Tomato Mild 300g'
 'Smiths Crinkle Chips Salt & Vinegar 330g'
 'Grain Waves         Sweet Chilli 210g'
 'Doritos Corn Chip Mexican Jalapeno 150g'
 'Grain Waves Sour    Cream&Chives 210G'
 'Kettle Sensations   Siracha Lime 150g' 'Twisties Cheese     270g'
 'WW Crinkle Cut      Chicken 175g' 'Thins Chips Light&  Tangy 175g'
 'CCs Original 175g' 'Burger Rings 220g'
 'NCC Sour Cream &    Garden Chives 175g'
 'Doritos Corn Chip Southern Chicken 150g' 'Cheezels Cheese Box 125g'
 'Smiths Crinkle      Original 330g'
 'Infzns Crn Crnchers Tangy Gcamole 110g'
 'Kettle Sea Salt     And Vinegar 175g'
 'Smiths Chip Thinly  Cut Original 175g' 'Kettle Original 175g'
 'Red Rock Deli Thai  Chilli&Lime 150g' 'Pringles Sthrn FriedChicken 134g'
 'Pringles Sweet&Spcy BBQ 134g' 'Red Rock Deli SR    Salsa & Mzzrlla 150g'
 'Thins Chips         Originl saltd 175g'
 'Red Rock Deli Sp    Salt & Truffle 150G'
 'Smiths Thinly       Swt Chli&S/Cream175G' 'Kettle Chilli 175g'
 'Doritos Mexicana    170g' 'Smiths Crinkle Cut  French OnionDip 150g'
 'Natural ChipCo      Hony Soy Chckn175g'
 'Dorito Corn Chp     Supreme 380g' 'Twisties Chicken270g'
 'Smiths Thinly Cut   Roast Chicken 175g'
 'Smiths Crinkle Cut  Tomato Salsa 150g'
 'Kettle Mozzarella   Basil & Pesto 175g'
 'Infuzions Thai SweetChili PotatoMix 110g'
 'Kettle Sensations   Camembert & Fig 150g'
 'Smith Crinkle Cut   Mac N Cheese 150g'
 'Kettle Honey Soy    Chicken 175g' 'Thins Chips Seasonedchicken 175g'
 'Smiths Crinkle Cut  Salt & Vinegar 170g'
 'Infuzions BBQ Rib   Prawn Crackers 110g'
 'GrnWves Plus Btroot & Chilli Jam 180g'
 'Tyrrells Crisps     Lightly Salted 165g'
 'Kettle Sweet Chilli And Sour Cream 175g'
 'Doritos Salsa       Medium 300g' 'Kettle 135g Swt Pot Sea Salt'
 'Pringles SourCream  Onion 134g' 'Doritos Corn Chips  Original 170g'
 'Twisties Cheese     Burger 250g'
 'Old El Paso Salsa   Dip Chnky Tom Ht300g'
 'Cobs Popd Swt/Chlli &Sr/Cream Chips 110g'
 'Woolworths Mild     Salsa 300g'
 'Natural Chip Co     Tmato Hrb&Spce 175g'
 'Smiths Crinkle Cut  Chips Original 170g'
 'Cobs Popd Sea Salt  Chips 110g'
 'Smiths Crinkle Cut  Chips Chs&Onion170g'
 'French Fries Potato Chips 175g'
 'Old El Paso Salsa   Dip Tomato Med 300g'
 'Doritos Corn Chips  Cheese Supreme 170g'
 'Pringles Original   Crisps 134g' 'RRD Chilli&         Coconut 150g'
 'WW Original Corn    Chips 200g' 'Thins Potato Chips  Hot & Spicy 175g'
 'Cobs Popd Sour Crm  &Chives Chips 110g'
 'Smiths Crnkle Chip  Orgnl Big Bag 380g'
 'Doritos Corn Chips  Nacho Cheese 170g'
 'Kettle Sensations   BBQ&Maple 150g' 'WW D/Style Chip     Sea Salt 200g'
 'Pringles Chicken    Salt Crips 134g' 'WW Original Stacked Chips 160g'
 'Smiths Chip Thinly  CutSalt/Vinegr175g' 'Cheezels Cheese 330g'
 'Tostitos Lightly    Salted 175g' 'Thins Chips Salt &  Vinegar 175g'
 'Smiths Crinkle Cut  Chips Barbecue 170g' 'Cheetos Puffs 165g'
 'RRD Sweet Chilli &  Sour Cream 165g' 'WW Crinkle Cut      Original 175g'
 'Tostitos Splash Of  Lime 175g' 'Woolworths Medium   Salsa 300g'
 'Kettle Tortilla ChpsBtroot&Ricotta 150g' 'CCs Tasty Cheese    175g'
 'Woolworths Cheese   Rings 190g' 'Tostitos Smoked     Chipotle 175g'
 'Pringles Barbeque   134g' 'WW Supreme Cheese   Corn Chips 200g'
 'Pringles Mystery    Flavour 134g'
 'Tyrrells Crisps     Ched & Chives 165g'
 'Snbts Whlgrn Crisps Cheddr&Mstrd 90g' 'Cheetos Chs & Bacon Balls 190g'
 'Pringles Slt Vingar 134g' 'Infuzions SourCream&Herbs Veg Strws 110g'
 'Kettle Tortilla ChpsFeta&Garlic 150g'
 'Infuzions Mango     Chutny Papadums 70g'
 'RRD Steak &         Chimuchurri 150g' 'RRD Honey Soy       Chicken 165g'
 'Sunbites Whlegrn    Crisps Frch/Onin 90g' 'RRD Salt & Vinegar  165g'
 'Doritos Cheese      Supreme 330g' 'Smiths Crinkle Cut  Snag&Sauce 150g'
 'WW Sour Cream &OnionStacked Chips 160g' 'RRD Lime & Pepper   165g'
 'Natural ChipCo Sea  Salt & Vinegr 175g'
 'Red Rock Deli Chikn&Garlic Aioli 150g'
 'RRD SR Slow Rst     Pork Belly 150g' 'RRD Pc Sea Salt     165g'
 'Smith Crinkle Cut   Bolognese 150g' 'Doritos Salsa Mild  300g']
In [31]:
check_words = ["Chips", "Kettle", "Crisps","Chip","Crackers", "Red Rock Deli", "RRD", "Pringles","Smiths","Smith",]
In [32]:
no_chips = transaction_data[~transaction_data['PROD_NAME'].str.contains('|'.join(check_words), case=False, na=False)]
print(no_chips['PROD_NAME'].unique())
['CCs Nacho Cheese    175g' 'Old El Paso Salsa   Dip Tomato Mild 300g'
 'Grain Waves         Sweet Chilli 210g'
 'Grain Waves Sour    Cream&Chives 210G' 'Twisties Cheese     270g'
 'WW Crinkle Cut      Chicken 175g' 'CCs Original 175g'
 'Burger Rings 220g' 'NCC Sour Cream &    Garden Chives 175g'
 'Cheezels Cheese Box 125g' 'Infzns Crn Crnchers Tangy Gcamole 110g'
 'Doritos Mexicana    170g' 'Dorito Corn Chp     Supreme 380g'
 'Twisties Chicken270g' 'Infuzions Thai SweetChili PotatoMix 110g'
 'GrnWves Plus Btroot & Chilli Jam 180g' 'Doritos Salsa       Medium 300g'
 'Twisties Cheese     Burger 250g'
 'Old El Paso Salsa   Dip Chnky Tom Ht300g'
 'Woolworths Mild     Salsa 300g'
 'Old El Paso Salsa   Dip Tomato Med 300g' 'Cheezels Cheese 330g'
 'Tostitos Lightly    Salted 175g' 'Cheetos Puffs 165g'
 'WW Crinkle Cut      Original 175g' 'Tostitos Splash Of  Lime 175g'
 'Woolworths Medium   Salsa 300g' 'CCs Tasty Cheese    175g'
 'Woolworths Cheese   Rings 190g' 'Cheetos Chs & Bacon Balls 190g'
 'Infuzions SourCream&Herbs Veg Strws 110g'
 'Infuzions Mango     Chutny Papadums 70g'
 'Doritos Cheese      Supreme 330g' 'Doritos Salsa Mild  300g']
In [33]:
salsa_df = transaction_data[transaction_data['PROD_NAME'].str.contains("salsa", case=False, na=False)]
salsa_df['PROD_NAME'].unique()
Out[33]:
array(['Old El Paso Salsa   Dip Tomato Mild 300g',
       'Red Rock Deli SR    Salsa & Mzzrlla 150g',
       'Smiths Crinkle Cut  Tomato Salsa 150g',
       'Doritos Salsa       Medium 300g',
       'Old El Paso Salsa   Dip Chnky Tom Ht300g',
       'Woolworths Mild     Salsa 300g',
       'Old El Paso Salsa   Dip Tomato Med 300g',
       'Woolworths Medium   Salsa 300g', 'Doritos Salsa Mild  300g'],
      dtype=object)
In [34]:
not_salsa = ['Red Rock Deli SR    Salsa & Mzzrlla 150g', 'Smiths Crinkle Cut  Tomato Salsa 150g'] 
In [35]:
True_Salsa = salsa_df[~salsa_df['PROD_NAME'].isin(not_salsa)]
In [36]:
True_Salsa.head()
Out[36]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES
5 2019-05-19 4 4074 2982 57 Old El Paso Salsa Dip Tomato Mild 300g 1 5.1
25 2019-05-15 39 39144 35506 57 Old El Paso Salsa Dip Tomato Mild 300g 1 5.1
63 2019-05-15 82 82480 82047 101 Doritos Salsa Medium 300g 1 2.6
71 2018-08-15 94 94233 93956 65 Old El Paso Salsa Dip Chnky Tom Ht300g 1 5.1
74 2018-08-16 97 97159 97271 35 Woolworths Mild Salsa 300g 5 7.5
In [37]:
transaction_data = transaction_data[~transaction_data['PROD_NAME'].isin(True_Salsa['PROD_NAME'])]
In [38]:
salsa_count = len(True_Salsa)
chips_count = len(transaction_data)
salsa_percentage = (salsa_count / (salsa_count + chips_count)) * 100 
print(f"Salsa Percentage: {salsa_percentage:.2f}%")
Salsa Percentage: 5.73%
In [ ]:
 
In [39]:
# Derive features
transaction_data["Pack_Size (g)"] = transaction_data["PROD_NAME"].str.extract(r"(\d+)").astype(str)
transaction_data["Brand"] = transaction_data["PROD_NAME"].str.split().str[0]

# Extract first two words as brand candidate
# transaction_data["Brand"] = transaction_data["PROD_NAME"].str.split().str[0:2].str.join(" ")

# Preview new features
display(transaction_data[["PROD_NAME", "Pack_Size (g)", "Brand"]].head())
PROD_NAME Pack_Size (g) Brand
0 Natural Chip Compny SeaSalt175g 175 Natural
1 CCs Nacho Cheese 175g 175 CCs
2 Smiths Crinkle Cut Chips Chicken 170g 170 Smiths
3 Smiths Chip Thinly S/Cream&Onion 175g 175 Smiths
4 Kettle Tortilla ChpsHny&Jlpno Chili 150g 150 Kettle
In [40]:
transaction_data["Pack_Size (g)"].dtype
Out[40]:
dtype('O')
In [41]:
print(transaction_data["PROD_NAME"].nunique())
transaction_data["PROD_NAME"].unique()
107
Out[41]:
array(['Natural Chip        Compny SeaSalt175g',
       'CCs Nacho Cheese    175g',
       'Smiths Crinkle Cut  Chips Chicken 170g',
       'Smiths Chip Thinly  S/Cream&Onion 175g',
       'Kettle Tortilla ChpsHny&Jlpno Chili 150g',
       'Smiths Crinkle Chips Salt & Vinegar 330g',
       'Grain Waves         Sweet Chilli 210g',
       'Doritos Corn Chip Mexican Jalapeno 150g',
       'Grain Waves Sour    Cream&Chives 210G',
       'Kettle Sensations   Siracha Lime 150g',
       'Twisties Cheese     270g', 'WW Crinkle Cut      Chicken 175g',
       'Thins Chips Light&  Tangy 175g', 'CCs Original 175g',
       'Burger Rings 220g', 'NCC Sour Cream &    Garden Chives 175g',
       'Doritos Corn Chip Southern Chicken 150g',
       'Cheezels Cheese Box 125g', 'Smiths Crinkle      Original 330g',
       'Infzns Crn Crnchers Tangy Gcamole 110g',
       'Kettle Sea Salt     And Vinegar 175g',
       'Smiths Chip Thinly  Cut Original 175g', 'Kettle Original 175g',
       'Red Rock Deli Thai  Chilli&Lime 150g',
       'Pringles Sthrn FriedChicken 134g', 'Pringles Sweet&Spcy BBQ 134g',
       'Red Rock Deli SR    Salsa & Mzzrlla 150g',
       'Thins Chips         Originl saltd 175g',
       'Red Rock Deli Sp    Salt & Truffle 150G',
       'Smiths Thinly       Swt Chli&S/Cream175G', 'Kettle Chilli 175g',
       'Doritos Mexicana    170g',
       'Smiths Crinkle Cut  French OnionDip 150g',
       'Natural ChipCo      Hony Soy Chckn175g',
       'Dorito Corn Chp     Supreme 380g', 'Twisties Chicken270g',
       'Smiths Thinly Cut   Roast Chicken 175g',
       'Smiths Crinkle Cut  Tomato Salsa 150g',
       'Kettle Mozzarella   Basil & Pesto 175g',
       'Infuzions Thai SweetChili PotatoMix 110g',
       'Kettle Sensations   Camembert & Fig 150g',
       'Smith Crinkle Cut   Mac N Cheese 150g',
       'Kettle Honey Soy    Chicken 175g',
       'Thins Chips Seasonedchicken 175g',
       'Smiths Crinkle Cut  Salt & Vinegar 170g',
       'Infuzions BBQ Rib   Prawn Crackers 110g',
       'GrnWves Plus Btroot & Chilli Jam 180g',
       'Tyrrells Crisps     Lightly Salted 165g',
       'Kettle Sweet Chilli And Sour Cream 175g',
       'Kettle 135g Swt Pot Sea Salt', 'Pringles SourCream  Onion 134g',
       'Doritos Corn Chips  Original 170g',
       'Twisties Cheese     Burger 250g',
       'Cobs Popd Swt/Chlli &Sr/Cream Chips 110g',
       'Natural Chip Co     Tmato Hrb&Spce 175g',
       'Smiths Crinkle Cut  Chips Original 170g',
       'Cobs Popd Sea Salt  Chips 110g',
       'Smiths Crinkle Cut  Chips Chs&Onion170g',
       'French Fries Potato Chips 175g',
       'Doritos Corn Chips  Cheese Supreme 170g',
       'Pringles Original   Crisps 134g',
       'RRD Chilli&         Coconut 150g',
       'WW Original Corn    Chips 200g',
       'Thins Potato Chips  Hot & Spicy 175g',
       'Cobs Popd Sour Crm  &Chives Chips 110g',
       'Smiths Crnkle Chip  Orgnl Big Bag 380g',
       'Doritos Corn Chips  Nacho Cheese 170g',
       'Kettle Sensations   BBQ&Maple 150g',
       'WW D/Style Chip     Sea Salt 200g',
       'Pringles Chicken    Salt Crips 134g',
       'WW Original Stacked Chips 160g',
       'Smiths Chip Thinly  CutSalt/Vinegr175g', 'Cheezels Cheese 330g',
       'Tostitos Lightly    Salted 175g',
       'Thins Chips Salt &  Vinegar 175g',
       'Smiths Crinkle Cut  Chips Barbecue 170g', 'Cheetos Puffs 165g',
       'RRD Sweet Chilli &  Sour Cream 165g',
       'WW Crinkle Cut      Original 175g',
       'Tostitos Splash Of  Lime 175g',
       'Kettle Tortilla ChpsBtroot&Ricotta 150g',
       'CCs Tasty Cheese    175g', 'Woolworths Cheese   Rings 190g',
       'Tostitos Smoked     Chipotle 175g', 'Pringles Barbeque   134g',
       'WW Supreme Cheese   Corn Chips 200g',
       'Pringles Mystery    Flavour 134g',
       'Tyrrells Crisps     Ched & Chives 165g',
       'Snbts Whlgrn Crisps Cheddr&Mstrd 90g',
       'Cheetos Chs & Bacon Balls 190g', 'Pringles Slt Vingar 134g',
       'Infuzions SourCream&Herbs Veg Strws 110g',
       'Kettle Tortilla ChpsFeta&Garlic 150g',
       'Infuzions Mango     Chutny Papadums 70g',
       'RRD Steak &         Chimuchurri 150g',
       'RRD Honey Soy       Chicken 165g',
       'Sunbites Whlegrn    Crisps Frch/Onin 90g',
       'RRD Salt & Vinegar  165g', 'Doritos Cheese      Supreme 330g',
       'Smiths Crinkle Cut  Snag&Sauce 150g',
       'WW Sour Cream &OnionStacked Chips 160g',
       'RRD Lime & Pepper   165g',
       'Natural ChipCo Sea  Salt & Vinegr 175g',
       'Red Rock Deli Chikn&Garlic Aioli 150g',
       'RRD SR Slow Rst     Pork Belly 150g', 'RRD Pc Sea Salt     165g',
       'Smith Crinkle Cut   Bolognese 150g'], dtype=object)
In [42]:
transaction_data["PROD_NAME"].value_counts()
Out[42]:
PROD_NAME
Kettle Mozzarella   Basil & Pesto 175g      3304
Kettle Tortilla ChpsHny&Jlpno Chili 150g    3296
Cobs Popd Swt/Chlli &Sr/Cream Chips 110g    3269
Tyrrells Crisps     Ched & Chives 165g      3268
Cobs Popd Sea Salt  Chips 110g              3265
                                            ... 
Sunbites Whlegrn    Crisps Frch/Onin 90g    1432
RRD Pc Sea Salt     165g                    1431
NCC Sour Cream &    Garden Chives 175g      1419
French Fries Potato Chips 175g              1418
WW Crinkle Cut      Original 175g           1410
Name: count, Length: 107, dtype: int64
In [43]:
print(transaction_data["Brand"].nunique())
transaction_data["Brand"].unique()
28
Out[43]:
array(['Natural', 'CCs', 'Smiths', 'Kettle', 'Grain', 'Doritos',
       'Twisties', 'WW', 'Thins', 'Burger', 'NCC', 'Cheezels', 'Infzns',
       'Red', 'Pringles', 'Dorito', 'Infuzions', 'Smith', 'GrnWves',
       'Tyrrells', 'Cobs', 'French', 'RRD', 'Tostitos', 'Cheetos',
       'Woolworths', 'Snbts', 'Sunbites'], dtype=object)
In [44]:
# Check brands
transaction_data["Brand"].value_counts()
Out[44]:
Brand
Kettle        41288
Smiths        28859
Pringles      25102
Doritos       22041
Thins         14075
RRD           11894
Infuzions     11057
WW            10320
Cobs           9693
Tostitos       9471
Twisties       9454
Tyrrells       6442
Grain          6272
Natural        6050
Red            5885
Cheezels       4603
CCs            4551
Dorito         3183
Infzns         3144
Smith          2963
Cheetos        2927
Snbts          1576
Burger         1564
Woolworths     1516
GrnWves        1468
Sunbites       1432
NCC            1419
French         1418
Name: count, dtype: int64
In [45]:
brand_corrections = {
    "Infzns": "Infuzions",
    "Red": "Red Rock Deli",
    "RRD": "Red Rock Deli",
    "Grain": "Grain Waves",
    "GrnWves": "Grain Waves",
    "Snbts": "Sunbites",
    "Natural": "Natural Chip Co",
    "NCC": "Natural Chip Co",
    "WW": "Woolworths",
    "Smith": "Smiths",
    "Dorito": "Doritos"
}


transaction_data["Brand"] = transaction_data["Brand"].replace(brand_corrections)

print(transaction_data["Brand"].nunique())
# Check that there are no duplicate brands 
transaction_data["Brand"].unique()
20
Out[45]:
array(['Natural Chip Co', 'CCs', 'Smiths', 'Kettle', 'Grain Waves',
       'Doritos', 'Twisties', 'Woolworths', 'Thins', 'Burger', 'Cheezels',
       'Infuzions', 'Red Rock Deli', 'Pringles', 'Tyrrells', 'Cobs',
       'French', 'Tostitos', 'Cheetos', 'Sunbites'], dtype=object)
In [46]:
# Check brands
transaction_data["Brand"].value_counts()
Out[46]:
Brand
Kettle             41288
Smiths             31822
Doritos            25224
Pringles           25102
Red Rock Deli      17779
Infuzions          14201
Thins              14075
Woolworths         11836
Cobs                9693
Tostitos            9471
Twisties            9454
Grain Waves         7740
Natural Chip Co     7469
Tyrrells            6442
Cheezels            4603
CCs                 4551
Sunbites            3008
Cheetos             2927
Burger              1564
French              1418
Name: count, dtype: int64
In [47]:
transaction_data["Pack_Size (g)"].value_counts()
Out[47]:
Pack_Size (g)
175    66389
150    43131
134    25102
110    22387
170    19983
165    15297
330    12540
380     6416
270     6285
210     6272
200     4473
135     3257
250     3169
90      3008
190     2995
160     2970
220     1564
70      1507
180     1468
125     1454
Name: count, dtype: int64
In [48]:
# Calculate spend per transaction
spend_per_txn = transaction_data.groupby("TXN_ID")["TOT_SALES"].sum().reset_index()
spend_per_txn.rename(columns={"TOT_SALES": "Spend_per_Transaction"}, inplace=True)

# Merge back
transaction_data = transaction_data.merge(spend_per_txn, on="TXN_ID", how="left")

# Calculate unit price
transaction_data["Unit_Price"] = transaction_data["TOT_SALES"] / transaction_data["PROD_QTY"]

# Preview
transaction_data[["TXN_ID", "TOT_SALES", "PROD_QTY", "Spend_per_Transaction", "Unit_Price"]].head(10)
Out[48]:
TXN_ID TOT_SALES PROD_QTY Spend_per_Transaction Unit_Price
0 1 6.0 2 6.0 3.00
1 348 6.3 3 6.3 2.10
2 383 2.9 2 2.9 1.45
3 974 15.0 5 15.0 3.00
4 1038 13.8 3 13.8 4.60
5 3333 5.7 1 5.7 5.70
6 3539 3.6 1 3.6 3.60
7 4525 3.9 1 3.9 3.90
8 6900 7.2 2 7.2 3.60
9 7176 5.7 1 5.7 5.70
In [49]:
transaction_data.describe()
Out[49]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_QTY TOT_SALES Spend_per_Transaction Unit_Price
count 249667 249667.000000 2.496670e+05 2.496670e+05 249667.000000 249667.000000 249667.000000 249667.000000 249667.000000
mean 2018-12-30 02:26:13.683346176 135.043662 1.355197e+05 1.351228e+05 56.294753 1.906175 7.288822 7.377441 3.818550
min 2018-07-01 00:00:00 1.000000 1.000000e+03 1.000000e+00 1.000000 1.000000 1.700000 1.700000 1.320000
25% 2018-09-30 00:00:00 70.000000 7.001600e+04 6.757350e+04 27.000000 2.000000 5.700000 5.800000 3.000000
50% 2018-12-30 00:00:00 130.000000 1.303600e+05 1.351470e+05 53.000000 2.000000 7.400000 7.400000 3.800000
75% 2019-03-31 00:00:00 203.000000 2.030790e+05 2.026325e+05 86.000000 2.000000 8.800000 8.800000 4.600000
max 2019-06-30 00:00:00 272.000000 2.373711e+06 2.415841e+06 114.000000 5.000000 29.500000 33.000000 6.500000
std NaN 76.773600 8.065751e+04 7.813158e+04 33.528625 0.342744 2.475283 2.619908 1.079015
In [ ]:
 
In [50]:
# Unique categories
print("Number of Unique Lifestages:", purchase_behaviour["LIFESTAGE"].nunique())
print("\nUnique Lifestages:", purchase_behaviour["LIFESTAGE"].unique())
print("\n Value Counts:\n", purchase_behaviour["LIFESTAGE"].value_counts())
Number of Unique Lifestages: 7

Unique Lifestages: ['YOUNG SINGLES/COUPLES' 'YOUNG FAMILIES' 'OLDER SINGLES/COUPLES'
 'MIDAGE SINGLES/COUPLES' 'NEW FAMILIES' 'OLDER FAMILIES' 'RETIREES']

 Value Counts:
 LIFESTAGE
RETIREES                  14805
OLDER SINGLES/COUPLES     14609
YOUNG SINGLES/COUPLES     14441
OLDER FAMILIES             9780
YOUNG FAMILIES             9178
MIDAGE SINGLES/COUPLES     7275
NEW FAMILIES               2549
Name: count, dtype: int64
In [51]:
print("Number of Unique Premium Segments:", purchase_behaviour["PREMIUM_CUSTOMER"].nunique())
print("\nUnique Premium Segments:", purchase_behaviour["PREMIUM_CUSTOMER"].unique())
print("\n Value Counts:\n", purchase_behaviour["PREMIUM_CUSTOMER"].value_counts())
Number of Unique Premium Segments: 3

Unique Premium Segments: ['Premium' 'Mainstream' 'Budget']

 Value Counts:
 PREMIUM_CUSTOMER
Mainstream    29245
Budget        24470
Premium       18922
Name: count, dtype: int64
In [52]:
# Count customers per lifestage
lifestage_counts = purchase_behaviour["LIFESTAGE"].value_counts()

# Count customers per premium segment
premium_counts = purchase_behaviour["PREMIUM_CUSTOMER"].value_counts()

# Plot distributions with labels
plt.figure(figsize=(12, 5))

# --- Lifestage ---
plt.subplot(1, 2, 1)
sns.barplot(x=lifestage_counts.index, y=lifestage_counts.values)
plt.xticks(rotation=45)
plt.title("Customer Distribution by Lifestage")

# Add counts on bars
for i, val in enumerate(lifestage_counts.values):
    plt.text(i, val + 50, str(val), ha='center', va='bottom')

# --- Premium Segment ---
plt.subplot(1, 2, 2)
sns.barplot(x=premium_counts.index, y=premium_counts.values)
plt.title("Customer Distribution by Premium Segment")

# Add counts on bars
for i, val in enumerate(premium_counts.values):
    plt.text(i, val + 50, str(val), ha='center', va='bottom')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

5. Merge Datasets¶

In [53]:
df = transaction_data.merge(purchase_behaviour, on="LYLTY_CARD_NBR", how="inner")
df.head()
Out[53]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES Pack_Size (g) Brand Spend_per_Transaction Unit_Price LIFESTAGE PREMIUM_CUSTOMER
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0 175 Natural Chip Co 6.0 3.00 YOUNG SINGLES/COUPLES Premium
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3 175 CCs 6.3 2.10 MIDAGE SINGLES/COUPLES Budget
2 2019-05-20 1 1343 383 61 Smiths Crinkle Cut Chips Chicken 170g 2 2.9 170 Smiths 2.9 1.45 MIDAGE SINGLES/COUPLES Budget
3 2018-08-17 2 2373 974 69 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0 175 Smiths 15.0 3.00 MIDAGE SINGLES/COUPLES Budget
4 2018-08-18 2 2426 1038 108 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8 150 Kettle 13.8 4.60 MIDAGE SINGLES/COUPLES Budget
In [54]:
df.shape
Out[54]:
(249667, 14)
In [55]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249667 entries, 0 to 249666
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   DATE                   249667 non-null  datetime64[ns]
 1   STORE_NBR              249667 non-null  int64         
 2   LYLTY_CARD_NBR         249667 non-null  int64         
 3   TXN_ID                 249667 non-null  int64         
 4   PROD_NBR               249667 non-null  int64         
 5   PROD_NAME              249667 non-null  object        
 6   PROD_QTY               249667 non-null  int64         
 7   TOT_SALES              249667 non-null  float64       
 8   Pack_Size (g)          249667 non-null  object        
 9   Brand                  249667 non-null  object        
 10  Spend_per_Transaction  249667 non-null  float64       
 11  Unit_Price             249667 non-null  float64       
 12  LIFESTAGE              249667 non-null  object        
 13  PREMIUM_CUSTOMER       249667 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(5), object(5)
memory usage: 26.7+ MB
In [56]:
df.describe()
Out[56]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_QTY TOT_SALES Spend_per_Transaction Unit_Price
count 249667 249667.000000 2.496670e+05 2.496670e+05 249667.000000 249667.000000 249667.000000 249667.000000 249667.000000
mean 2018-12-30 02:26:13.683346176 135.043662 1.355197e+05 1.351228e+05 56.294753 1.906175 7.288822 7.377441 3.818550
min 2018-07-01 00:00:00 1.000000 1.000000e+03 1.000000e+00 1.000000 1.000000 1.700000 1.700000 1.320000
25% 2018-09-30 00:00:00 70.000000 7.001600e+04 6.757350e+04 27.000000 2.000000 5.700000 5.800000 3.000000
50% 2018-12-30 00:00:00 130.000000 1.303600e+05 1.351470e+05 53.000000 2.000000 7.400000 7.400000 3.800000
75% 2019-03-31 00:00:00 203.000000 2.030790e+05 2.026325e+05 86.000000 2.000000 8.800000 8.800000 4.600000
max 2019-06-30 00:00:00 272.000000 2.373711e+06 2.415841e+06 114.000000 5.000000 29.500000 33.000000 6.500000
std NaN 76.773600 8.065751e+04 7.813158e+04 33.528625 0.342744 2.475283 2.619908 1.079015
In [57]:
df.duplicated().sum()   
Out[57]:
np.int64(0)
In [58]:
# save dataframe
df.to_csv("QVI_Combined_Data.csv", index=False)
In [ ]:
 

6. Exploratory Data Analysis (EDA)¶

In [59]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

# Load the combined dataset
df = pd.read_csv("QVI_Combined_Data.csv")
df.head()
Out[59]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES Pack_Size (g) Brand Spend_per_Transaction Unit_Price LIFESTAGE PREMIUM_CUSTOMER
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0 175 Natural Chip Co 6.0 3.00 YOUNG SINGLES/COUPLES Premium
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3 175 CCs 6.3 2.10 MIDAGE SINGLES/COUPLES Budget
2 2019-05-20 1 1343 383 61 Smiths Crinkle Cut Chips Chicken 170g 2 2.9 170 Smiths 2.9 1.45 MIDAGE SINGLES/COUPLES Budget
3 2018-08-17 2 2373 974 69 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0 175 Smiths 15.0 3.00 MIDAGE SINGLES/COUPLES Budget
4 2018-08-18 2 2426 1038 108 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8 150 Kettle 13.8 4.60 MIDAGE SINGLES/COUPLES Budget
In [60]:
# -----------------------------
# 1. Basic Data Overview
# -----------------------------
display("Data Shape:", df.shape)
display("\nData Types:\n", df.dtypes)
display("\nMissing Values:\n", df.isnull().sum())
display("\nUnique Values per Column:\n", df.nunique())
display("\nStatistical Summary:\n", df.describe(include='all'))
'Data Shape:'
(249667, 14)
'\nData Types:\n'
DATE                      object
STORE_NBR                  int64
LYLTY_CARD_NBR             int64
TXN_ID                     int64
PROD_NBR                   int64
PROD_NAME                 object
PROD_QTY                   int64
TOT_SALES                float64
Pack_Size (g)              int64
Brand                     object
Spend_per_Transaction    float64
Unit_Price               float64
LIFESTAGE                 object
PREMIUM_CUSTOMER          object
dtype: object
'\nMissing Values:\n'
DATE                     0
STORE_NBR                0
LYLTY_CARD_NBR           0
TXN_ID                   0
PROD_NBR                 0
PROD_NAME                0
PROD_QTY                 0
TOT_SALES                0
Pack_Size (g)            0
Brand                    0
Spend_per_Transaction    0
Unit_Price               0
LIFESTAGE                0
PREMIUM_CUSTOMER         0
dtype: int64
'\nUnique Values per Column:\n'
DATE                        364
STORE_NBR                   271
LYLTY_CARD_NBR            71517
TXN_ID                   248156
PROD_NBR                    107
PROD_NAME                   107
PROD_QTY                      5
TOT_SALES                   104
Pack_Size (g)                20
Brand                        20
Spend_per_Transaction       183
Unit_Price                   40
LIFESTAGE                     7
PREMIUM_CUSTOMER              3
dtype: int64
'\nStatistical Summary:\n'
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES Pack_Size (g) Brand Spend_per_Transaction Unit_Price LIFESTAGE PREMIUM_CUSTOMER
count 249667 249667.000000 2.496670e+05 2.496670e+05 249667.000000 249667 249667.000000 249667.000000 249667.000000 249667 249667.000000 249667.000000 249667 249667
unique 364 NaN NaN NaN NaN 107 NaN NaN NaN 20 NaN NaN 7 3
top 2018-12-24 NaN NaN NaN NaN Kettle Mozzarella Basil & Pesto 175g NaN NaN NaN Kettle NaN NaN OLDER SINGLES/COUPLES Mainstream
freq 880 NaN NaN NaN NaN 3304 NaN NaN NaN 41288 NaN NaN 51362 96122
mean NaN 135.043662 1.355197e+05 1.351228e+05 56.294753 NaN 1.906175 7.288822 175.283490 NaN 7.377441 3.818550 NaN NaN
std NaN 76.773600 8.065751e+04 7.813158e+04 33.528625 NaN 0.342744 2.475283 59.146873 NaN 2.619908 1.079015 NaN NaN
min NaN 1.000000 1.000000e+03 1.000000e+00 1.000000 NaN 1.000000 1.700000 70.000000 NaN 1.700000 1.320000 NaN NaN
25% NaN 70.000000 7.001600e+04 6.757350e+04 27.000000 NaN 2.000000 5.700000 150.000000 NaN 5.800000 3.000000 NaN NaN
50% NaN 130.000000 1.303600e+05 1.351470e+05 53.000000 NaN 2.000000 7.400000 170.000000 NaN 7.400000 3.800000 NaN NaN
75% NaN 203.000000 2.030790e+05 2.026325e+05 86.000000 NaN 2.000000 8.800000 175.000000 NaN 8.800000 4.600000 NaN NaN
max NaN 272.000000 2.373711e+06 2.415841e+06 114.000000 NaN 5.000000 29.500000 380.000000 NaN 33.000000 6.500000 NaN NaN
In [ ]:
 

Customers

Customer LIFESTAGE¶

In [61]:
import matplotlib.pyplot as plt

# --- Aggregations ---
cust_count = df.groupby("LIFESTAGE")["LYLTY_CARD_NBR"].nunique()   # Count of unique customers
sales_sum = df.groupby("LIFESTAGE")["TOT_SALES"].sum()             # Total sales
txn_count = df.groupby("LIFESTAGE")["TXN_ID"].nunique()            # Count of unique transactions
qty_sum = df.groupby("LIFESTAGE")["PROD_QTY"].sum()                # Total quantity sold

# Dictionary of data
charts = {
    "Count of Customers (LYLTY_CARD_NBR) by Lifestage": cust_count,
    "Sum of Total Sales by Lifestage": sales_sum,
    "Count of Transactions (TXN_ID) by Lifestage": txn_count,
    "Sum of Product Quantity by Lifestage": qty_sum
}

# --- Plot ---
fig, axes = plt.subplots(2, 2, figsize=(14,10))

for ax, (title, data) in zip(axes.flatten(), charts.items()):
    # Sort descending
    data = data.sort_values(ascending=False)
    
    wedges, texts, autotexts = ax.pie(
        data, 
        autopct=lambda p: f"{p:.1f}%", 
        startangle=90
    )
    ax.set_title(title, fontsize=12, fontweight="bold")
    ax.legend(
        wedges, data.index, 
        title="Lifestage", 
        loc="center left", 
        bbox_to_anchor=(1, 0, 0.5, 1)   # move legend to right center
    )

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Customer PREMIUM¶

In [62]:
import matplotlib.pyplot as plt

# --- Aggregations by PREMIUM_CUSTOMER ---
cust_count_prem = df.groupby("PREMIUM_CUSTOMER")["LYLTY_CARD_NBR"].nunique()
sales_sum_prem = df.groupby("PREMIUM_CUSTOMER")["TOT_SALES"].sum()
txn_count_prem = df.groupby("PREMIUM_CUSTOMER")["TXN_ID"].nunique()
qty_sum_prem = df.groupby("PREMIUM_CUSTOMER")["PROD_QTY"].sum()

# Dictionary of data
charts_prem = {
    "Count of Customers (LYLTY_CARD_NBR) by Premium Type": cust_count_prem,
    "Sum of Total Sales by Premium Type": sales_sum_prem,
    "Count of Transactions (TXN_ID) by Premium Type": txn_count_prem,
    "Sum of Product Quantity by Premium Type": qty_sum_prem
}

# --- Plot ---
fig, axes = plt.subplots(2, 2, figsize=(14,10))

for ax, (title, data) in zip(axes.flatten(), charts_prem.items()):
    # Sort descending for consistent layout
    data = data.sort_values(ascending=False)
    
    wedges, texts, autotexts = ax.pie(
        data, 
        autopct=lambda p: f"{p:.1f}%", 
        startangle=90
    )
    ax.set_title(title, fontsize=12, fontweight="bold")
    ax.legend(
        wedges, data.index, 
        title="PREMIUM_CUSTOMER", 
        loc="center left", 
        bbox_to_anchor=(1, 0, 0.5, 1)   # legend at right
    )

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

PREMIUM WITH LIFESTAGE¶

In [63]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# --- Aggregations by PREMIUM_CUSTOMER and LIFESTAGE ---
# 1. Sum of TOT_SALES by PREMIUM_CUSTOMER and LIFESTAGE
sales_pivot = df.pivot_table(
    values='TOT_SALES', 
    index='PREMIUM_CUSTOMER', 
    columns='LIFESTAGE', 
    aggfunc='sum'
)

# Calculate percentages
sales_percentages = sales_pivot.div(sales_pivot.sum(axis=1), axis=0) * 100

# 2. Count of LYLTY_CARD_NBR by PREMIUM_CUSTOMER and LIFESTAGE
unique_customers = df.drop_duplicates('LYLTY_CARD_NBR')[['LYLTY_CARD_NBR', 'LIFESTAGE', 'PREMIUM_CUSTOMER']]
customer_pivot = pd.crosstab(
    index=unique_customers['PREMIUM_CUSTOMER'],
    columns=unique_customers['LIFESTAGE']
)
customer_percentages = customer_pivot.div(customer_pivot.sum(axis=1), axis=0) * 100

# 3. Count of TXN_ID by PREMIUM_CUSTOMER and LIFESTAGE
txn_pivot = pd.crosstab(
    index=df['PREMIUM_CUSTOMER'],
    columns=df['LIFESTAGE']
)
txn_percentages = txn_pivot.div(txn_pivot.sum(axis=1), axis=0) * 100

# Dictionary of data
charts_data = {
    "Sum of TOT_SALES by PREMIUM_CUSTOMER and LIFESTAGE": sales_percentages,
    "Count of LYLTY_CARD_NBR by PREMIUM_CUSTOMER and LIFESTAGE": customer_percentages,
    "Count of TXN_ID by PREMIUM_CUSTOMER and LIFESTAGE": txn_percentages
}

# --- Plot ---
fig, axes = plt.subplots(1, 3, figsize=(20, 7))

# Define colors for each customer segment
colors = ['#3498db', '#2ecc71', '#e74c3c']  # Blue, Green, Red

for ax, (title, data) in zip(axes.flatten(), charts_data.items()):
    # Prepare data for plotting
    mainstream = data.loc['Mainstream'].fillna(0)
    budget = data.loc['Budget'].fillna(0)
    premium = data.loc['Premium'].fillna(0)
    
    # Create the stacked bars
    x_pos = np.arange(len(data.columns))
    bar_width = 0.8
    
    bars_main = ax.bar(x_pos, mainstream, bar_width, label='Mainstream', color=colors[0], 
                       edgecolor='white', linewidth=0.5)
    bars_budg = ax.bar(x_pos, budget, bar_width, bottom=mainstream, label='Budget', color=colors[1], 
                       edgecolor='white', linewidth=0.5)
    bars_prem = ax.bar(x_pos, premium, bar_width, bottom=mainstream+budget, label='Premium', color=colors[2], 
                       edgecolor='white', linewidth=0.5)
    
    # Customize the chart
    ax.set_title(title, fontsize=12, fontweight="bold", pad=20)
    ax.set_xlabel('Lifestage', fontweight='bold')
    ax.set_ylabel('Percentage', fontweight='bold')
    
    # Set x-axis labels
    ax.set_xticks(x_pos)
    ax.set_xticklabels(data.columns, rotation=45, ha='right')
    
    # Set y-axis limit and grid
    ax.set_ylim(0, 100)
    ax.yaxis.grid(True, linestyle='--', alpha=0.7)
    
    # Add legend
    ax.legend(loc='upper right')
    
    # Add percentage labels to the bars
    for i, (m, b, p) in enumerate(zip(mainstream, budget, premium)):
        total = m + b + p
        if m > 5:
            ax.text(i, m/2, f'{m:.1f}%', ha='center', va='center', fontweight='bold', fontsize=9)
        if b > 5:
            ax.text(i, m + b/2, f'{b:.1f}%', ha='center', va='center', fontweight='bold', fontsize=9)
        if p > 5:
            ax.text(i, m + b + p/2, f'{p:.1f}%', ha='center', va='center', fontweight='bold', fontsize=9)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

LIFESTAGE WITH PREMIUM¶

In [64]:
import pandas as pd
import matplotlib.pyplot as plt


# Calculate the total sales, customer count, and product quantity for each segment
total_sales = df.groupby(['LIFESTAGE', 'PREMIUM_CUSTOMER'])['TOT_SALES'].sum().unstack()
customer_count = df.groupby(['LIFESTAGE', 'PREMIUM_CUSTOMER'])['LYLTY_CARD_NBR'].nunique().unstack()
total_quantity = df.groupby(['LIFESTAGE', 'PREMIUM_CUSTOMER'])['PROD_QTY'].sum().unstack()

# Sort the lifestages based on total sales in descending order
sorted_lifestage = total_sales.sum(axis=1).sort_values(ascending=True).index
total_sales = total_sales.loc[sorted_lifestage]
customer_count = customer_count.loc[sorted_lifestage]
total_quantity = total_quantity.loc[sorted_lifestage]

# Convert the data to 100% stacked percentages
sales_percent = total_sales.div(total_sales.sum(axis=1), axis=0) * 100
customers_percent = customer_count.div(customer_count.sum(axis=1), axis=0) * 100
quantity_percent = total_quantity.div(total_quantity.sum(axis=1), axis=0) * 100

# Create a figure with three subplots, arranged in 3 rows and 1 column
fig, axes = plt.subplots(3, 1, figsize=(15, 15))

# Plot the 100% stacked bar charts
sales_percent.plot(kind='barh', stacked=True, ax=axes[0])
customers_percent.plot(kind='barh', stacked=True, ax=axes[1])
quantity_percent.plot(kind='barh', stacked=True, ax=axes[2])

# Function to add percentage labels to the bars
def add_labels(ax):
    for container in ax.containers:
        for i, bar in enumerate(container):
            width = bar.get_width()
            if width > 0:
                ax.annotate(f'{width:.1f}%',
                            xy=(bar.get_x() + width / 2, bar.get_y() + bar.get_height() / 2),
                            ha='center', va='center', color='black',
                            fontsize=9, fontweight='bold')

# Add labels and titles to the charts
axes[0].set_title('Total Sales Distribution')
axes[0].set_xlabel('Total Sales (%)')
axes[0].set_ylabel('Lifestage')
axes[0].legend(title='Premium Customer', loc='upper right')
add_labels(axes[0])

axes[1].set_title('Customer Distribution')
axes[1].set_xlabel('Count of Customers (%)')
axes[1].set_ylabel('Lifestage')
axes[1].legend(title='Premium Customer', loc='upper right')
add_labels(axes[1])

axes[2].set_title('Transaction Distribution')
axes[2].set_xlabel('Count of Transactions (%)')
axes[2].set_ylabel('Lifestage')
axes[2].legend(title='Premium Customer', loc='upper right')
add_labels(axes[2])

# Adjust layout and show the plot
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

PREMIUM WIth Brand¶

In [65]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Prepare data ---
brand_sales = df.groupby(["PREMIUM_CUSTOMER", "Brand"])["TOT_SALES"].sum().reset_index()

# Pivot for stacked bar
brand_sales_pivot = brand_sales.pivot(
    index="PREMIUM_CUSTOMER", 
    columns="Brand", 
    values="TOT_SALES"
).fillna(0)

# --- Sort PREMIUM_CUSTOMER by total sales ---
brand_sales_pivot = brand_sales_pivot.loc[
    brand_sales_pivot.sum(axis=1).sort_values(ascending=False).index
]

# --- Sort brands (columns) by overall total sales ---
brand_order = brand_sales_pivot.sum(axis=0).sort_values(ascending=False).index
brand_sales_pivot = brand_sales_pivot[brand_order]

# Convert to percentages (100% stacked)
brand_sales_percent = brand_sales_pivot.div(brand_sales_pivot.sum(axis=1), axis=0) * 100

# --- Plot ---
ax = brand_sales_percent.plot(
    kind="bar", 
    stacked=True, 
    rot=0,
    figsize=(14, 7),
    colormap="tab20"   # large color palette for brands
)

plt.title("Sales by Premium Customer and Brand", fontsize=14, fontweight="bold")
plt.xlabel("Premium Customer Type")
plt.ylabel("Percentage of Sales (%)")

# Add percentage labels (only if big enough, e.g. >5%)
for i, row in enumerate(brand_sales_percent.values):
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 5:
            ax.text(i, cum_sum + val/2, f"{val:.1f}%", 
                    ha="center", va="center", fontsize=8, fontweight="bold", color="white")
        cum_sum += val

# Custom sorted legend
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, brand_order, title="Brand", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

LIFESTAGE WIth Brand¶

In [66]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Prepare data ---
brand_sales = df.groupby(["LIFESTAGE", "Brand"])["TOT_SALES"].sum().reset_index()

# Pivot for stacked bar
brand_sales_pivot = brand_sales.pivot(
    index="LIFESTAGE", 
    columns="Brand", 
    values="TOT_SALES"
).fillna(0)

# --- Sort LIFESTAGE by total sales ---
brand_sales_pivot = brand_sales_pivot.loc[
    brand_sales_pivot.sum(axis=1).sort_values(ascending=False).index
]

# --- Sort Brands by overall sales ---
brand_order = brand_sales_pivot.sum(axis=0).sort_values(ascending=False).index
brand_sales_pivot = brand_sales_pivot[brand_order]

# Convert to percentages (100% stacked)
brand_sales_percent = brand_sales_pivot.div(brand_sales_pivot.sum(axis=1), axis=0) * 100

# --- Plot ---
ax = brand_sales_percent.plot(
    kind="bar", 
    stacked=True, 
    rot=45,   # rotate labels for readability
    figsize=(14, 7),
    colormap="tab20"
)

plt.title("Sales by LIFESTAGE Customer and Brand", fontsize=14, fontweight="bold")
plt.xlabel("LIFESTAGE Customer Type")
plt.ylabel("Percentage of Sales (%)")

# Add percentage labels (only if >5%)
for i, row in enumerate(brand_sales_percent.values):
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 5:
            ax.text(i, cum_sum + val/2, f"{val:.1f}%", 
                    ha="center", va="center", fontsize=8, fontweight="bold", color="white")
        cum_sum += val

# Custom legend sorted by sales
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, brand_order, title="Brand", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

PREIMIUM With Store¶

In [67]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Aggregate total sales by store and premium ---
store_premium_sales = (
    df.groupby(["PREMIUM_CUSTOMER", "STORE_NBR"])["TOT_SALES"]
    .sum()
    .reset_index()
)

# --- Get top 10 stores by total sales ---
top_stores = (
    store_premium_sales.groupby("STORE_NBR")["TOT_SALES"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# --- Filter for top 10 stores ---
store_premium_sales = store_premium_sales[
    store_premium_sales["STORE_NBR"].isin(top_stores)
]

# Pivot for stacked bar
pivot_data = store_premium_sales.pivot(
    index="PREMIUM_CUSTOMER",
    columns="STORE_NBR",
    values="TOT_SALES"
).fillna(0)

# --- Sort PREMIUM_CUSTOMER (x-axis) by total sales ---
pivot_data = pivot_data.loc[
    pivot_data.sum(axis=1).sort_values(ascending=False).index
]

# --- Sort stores (legend) by total sales contribution ---
store_order = pivot_data.sum(axis=0).sort_values(ascending=False).index
pivot_data = pivot_data[store_order]

# Convert to percentages (100% stacked)
pivot_percent = pivot_data.div(pivot_data.sum(axis=1), axis=0) * 100

# --- Plot ---
ax = pivot_percent.plot(
    kind="bar",
    stacked=True,
    figsize=(12, 7),
    colormap="tab10"
)

plt.title("100% Stacked Bar - Sales by Premium Customer and Top 10 Stores", fontsize=14, fontweight="bold")
plt.xlabel("Premium Customer Type")
plt.ylabel("Percentage of Sales (%)")
plt.xticks(rotation=0)

# --- Add percentage labels inside bars ---
for i, row in enumerate(pivot_percent.values):
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 5:  # Show only if >5%
            ax.text(
                i,
                cum_sum + val / 2,
                f"{val:.1f}%",
                ha="center",
                va="center",
                fontsize=8,
                fontweight="bold",
                color="white"
            )
        cum_sum += val

# --- Custom sorted legend ---
handles, labels = ax.get_legend_handles_labels()
# Reorder legend by store_order
sorted_handles = [handles[labels.index(str(s))] for s in store_order]
ax.legend(sorted_handles, store_order, title="Store", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

LIFESTAGE with Store¶

In [68]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Aggregate total sales by store and lifestage ---
store_lifestage_sales = (
    df.groupby(["LIFESTAGE", "STORE_NBR"])["TOT_SALES"]
    .sum()
    .reset_index()
)

# --- Get top 10 stores by total sales ---
top_stores = (
    store_lifestage_sales.groupby("STORE_NBR")["TOT_SALES"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# --- Filter for top 10 stores ---
store_lifestage_sales = store_lifestage_sales[
    store_lifestage_sales["STORE_NBR"].isin(top_stores)
]

# Pivot for stacked bar
pivot_data = store_lifestage_sales.pivot(
    index="LIFESTAGE",
    columns="STORE_NBR",
    values="TOT_SALES"
).fillna(0)

# --- Sort LIFESTAGE (x-axis) by total sales ---
pivot_data = pivot_data.loc[
    pivot_data.sum(axis=1).sort_values(ascending=False).index
]

# --- Sort stores (legend) by total sales contribution ---
store_order = pivot_data.sum(axis=0).sort_values(ascending=False).index
pivot_data = pivot_data[store_order]

# Convert to percentages (100% stacked)
pivot_percent = pivot_data.div(pivot_data.sum(axis=1), axis=0) * 100

# --- Plot ---
ax = pivot_percent.plot(
    kind="bar",
    stacked=True,
    figsize=(12, 7),
    colormap="tab10"
)

plt.title("100% Stacked Bar - Sales by Lifestage and Top 10 Stores", fontsize=14, fontweight="bold")
plt.xlabel("Lifestage")
plt.ylabel("Percentage of Sales (%)")
plt.xticks(rotation=45, ha="right")

# --- Add percentage labels inside bars ---
for i, row in enumerate(pivot_percent.values):
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 5:  # Show only if >5%
            ax.text(
                i,
                cum_sum + val / 2,
                f"{val:.1f}%",
                ha="center",
                va="center",
                fontsize=8,
                fontweight="bold",
                color="white"
            )
        cum_sum += val

# --- Custom sorted legend ---
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(str(s))] for s in store_order]
ax.legend(sorted_handles, store_order, title="Store", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Brand

In [69]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Prepare data ---
brand_sales = df.groupby("Brand")["TOT_SALES"].sum().sort_values(ascending=True)  # ascending for nice horizontal bars

# --- Plot ---
plt.figure(figsize=(12, 7))
ax = brand_sales.plot(kind="barh", color="skyblue")

plt.title("Total Sales by Brand", fontsize=14, fontweight="bold")
plt.xlabel("Total Sales ($)")
plt.ylabel("Brand")

# Add labels on bars
for i, v in enumerate(brand_sales):
    ax.text(v + (0.01 * brand_sales.max()), i, f"$ {v:,.0f}", 
            va="center", fontsize=9, fontweight="bold")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Brand with PREIMIUM¶

In [70]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Prepare data ---
brand_sales = df.groupby(["PREMIUM_CUSTOMER", "Brand"])["TOT_SALES"].sum().reset_index()

# Pivot for stacked bar
brand_sales_pivot = brand_sales.pivot(
    index="Brand", 
    columns="PREMIUM_CUSTOMER", 
    values="TOT_SALES"
).fillna(0)

# Convert to percentages for 100% stacked bar
brand_sales_percent = brand_sales_pivot.div(brand_sales_pivot.sum(axis=1), axis=0) * 100

# Sort brands by total sales (ascending for readability)
brand_sales_percent = brand_sales_percent.loc[
    brand_sales_pivot.sum(axis=1).sort_values(ascending=True).index
]

# --- Plot ---
ax = brand_sales_percent.plot(
    kind="barh", 
    stacked=True, 
    figsize=(12, 8),
    color=["#3498db", "#2ecc71", "#e74c3c"]  # Mainstream, Budget, Premium
)

plt.title("Sales by Brand and Premium Customer", fontsize=14, fontweight="bold")
plt.xlabel("Percentage of Sales (%)")
plt.ylabel("Brand")

# Add percentage labels inside bars
for i, row in enumerate(brand_sales_percent.values):
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 5:  # Show label only if >5%
            ax.text(cum_sum + val/2, i, f"{val:.1f}%", 
                    ha="center", va="center", fontsize=9, fontweight="bold", color="white")
        cum_sum += val

# --- Sort legend by total sales contribution ---
handles, labels = ax.get_legend_handles_labels()
premium_totals = brand_sales_pivot.sum(axis=0).sort_values(ascending=False)
sorted_labels = premium_totals.index.tolist()
sorted_handles = [handles[labels.index(l)] for l in sorted_labels]

ax.legend(
    sorted_handles, sorted_labels,
    title="PREMIUM_CUSTOMER",
    bbox_to_anchor=(1.05, 1), loc="upper left"
)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Brand With LIFESTAGE¶

In [71]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Prepare data ---
brand_sales = df.groupby(["LIFESTAGE", "Brand"])["TOT_SALES"].sum().reset_index()

# Pivot for stacked bar
brand_sales_pivot = brand_sales.pivot(
    index="Brand", 
    columns="LIFESTAGE", 
    values="TOT_SALES"
).fillna(0)

# Convert to percentages for 100% stacked bar
brand_sales_percent = brand_sales_pivot.div(brand_sales_pivot.sum(axis=1), axis=0) * 100

# Sort brands by total sales (ascending for better layout)
brand_sales_percent = brand_sales_percent.loc[
    brand_sales_pivot.sum(axis=1).sort_values(ascending=True).index
]

# --- Plot ---
ax = brand_sales_percent.plot(
    kind="barh", 
    stacked=True, 
    figsize=(14, 9),
    colormap="tab20"  # large palette for many lifestages
)

plt.title("Sales by Brand and Lifestage", fontsize=14, fontweight="bold")
plt.xlabel("Percentage of Sales (%)")
plt.ylabel("Brand")

# Add percentage labels inside bars
for i, row in enumerate(brand_sales_percent.values):
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 5:  # Show label only if >5%
            ax.text(cum_sum + val/2, i, f"{val:.1f}%", 
                    ha="center", va="center", fontsize=8, fontweight="bold", color="white")
        cum_sum += val

# --- Sort legend by total sales contribution ---
handles, labels = ax.get_legend_handles_labels()
# Sum by lifestage across all brands
lifestage_totals = brand_sales_pivot.sum(axis=0).sort_values(ascending=False)
sorted_labels = lifestage_totals.index.tolist()
sorted_handles = [handles[labels.index(l)] for l in sorted_labels]

ax.legend(sorted_handles, sorted_labels, title="Lifestage", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Brand With Store¶

In [72]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Prepare data ---
store_sales = df.groupby(["STORE_NBR", "Brand"])["TOT_SALES"].sum().reset_index()

# Keep only top 10 stores by total sales
top_stores = store_sales.groupby("STORE_NBR")["TOT_SALES"].sum().nlargest(10).index
store_sales_top = store_sales[store_sales["STORE_NBR"].isin(top_stores)]

# Pivot for stacked bar
store_sales_pivot = store_sales_top.pivot(
    index="Brand", 
    columns="STORE_NBR", 
    values="TOT_SALES"
).fillna(0)

# Convert to percentages for 100% stacked bar
store_sales_percent = store_sales_pivot.div(store_sales_pivot.sum(axis=1), axis=0) * 100

# Sort brands by total sales (ascending for better layout)
store_sales_percent = store_sales_percent.loc[
    store_sales_pivot.sum(axis=1).sort_values(ascending=True).index
]

# Sort stores by total sales
store_order = store_sales_top.groupby("STORE_NBR")["TOT_SALES"].sum().sort_values(ascending=False).index
store_sales_percent = store_sales_percent[store_order]

# --- Plot ---
ax = store_sales_percent.plot(
    kind="barh", 
    stacked=True, 
    figsize=(14, 9),
    colormap="tab20"  # large palette for many stores
)

plt.title("Sales by Brand - Top 10 Stores", fontsize=14, fontweight="bold")
plt.xlabel("Percentage of Sales (%)")
plt.ylabel("Brand")

# Add percentage labels inside bars
for i, row in enumerate(store_sales_percent.values):
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 5:  # Show label only if >5%
            ax.text(cum_sum + val/2, i, f"{val:.1f}%", 
                    ha="center", va="center", fontsize=8, fontweight="bold", color="white")
        cum_sum += val

# Sorted legend (top 10 stores)
handles, labels = ax.get_legend_handles_labels()
sorted_labels = [str(s) for s in store_order]
sorted_handles = [handles[labels.index(str(l))] for l in sorted_labels]

ax.legend(sorted_handles, sorted_labels, title="Store", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Stores

Top 10 Stores¶

In [73]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Aggregate total sales by store ---
store_sales = df.groupby("STORE_NBR")["TOT_SALES"].sum().sort_values(ascending=False).head(10)

# --- Plot ---
plt.figure(figsize=(12, 6))
ax = store_sales.plot(
    kind="bar", 
    color="skyblue", 
    edgecolor="black"
)

plt.title("Top 10 Stores by Total Sales", fontsize=14, fontweight="bold")
plt.xlabel("Store Number")
plt.ylabel("Total Sales ($)")
plt.xticks(rotation=0, ha="right")

# Add labels on bars
for i, v in enumerate(store_sales.values):
    ax.text(i, v + 0.01 * store_sales.max(), f"${v:,.0f}", 
            ha="center", va="bottom", fontsize=9, fontweight="bold")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Store with LIFESTAGE¶

In [74]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Aggregate total sales by store and lifestage ---
store_lifestage_sales = (
    df.groupby(["STORE_NBR", "LIFESTAGE"])["TOT_SALES"]
    .sum()
    .reset_index()
)

# --- Find top 10 stores by total sales ---
top_stores = (
    store_lifestage_sales.groupby("STORE_NBR")["TOT_SALES"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# --- Filter only top 10 stores ---
store_lifestage_sales = store_lifestage_sales[
    store_lifestage_sales["STORE_NBR"].isin(top_stores)
]

# Pivot for stacked bar chart
pivot_data = store_lifestage_sales.pivot(
    index="STORE_NBR",
    columns="LIFESTAGE",
    values="TOT_SALES"
).fillna(0)

# Sort stores by total sales
pivot_data = pivot_data.loc[
    pivot_data.sum(axis=1).sort_values(ascending=False).index
]

# --- Sort Lifestage (columns) by total sales contribution ---
lifestage_order = pivot_data.sum(axis=0).sort_values(ascending=False).index
pivot_data = pivot_data[lifestage_order]

# --- Plot ---
ax = pivot_data.plot(
    kind="bar",
    stacked=True,
    rot=0,
    figsize=(14, 7),
    colormap="tab20"
)

plt.title("Top 10 Stores by Sales (Stacked by Lifestage)", fontsize=14, fontweight="bold")
plt.xlabel("Store Number")
plt.ylabel("Total Sales ($)")
plt.xticks(rotation=0, ha="right")

# --- Add percentage labels inside bars ---
for i, row in enumerate(pivot_data.values):
    total = row.sum()
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 0:
            percent = (val / total) * 100
            if percent > 5:  # Show only if >5% for clarity
                ax.text(
                    i,
                    cum_sum + val / 2,
                    f"{percent:.1f}%",
                    ha="center",
                    va="center",
                    fontsize=8,
                    fontweight="bold",
                    color="white"
                )
        cum_sum += val

# --- Custom sorted legend ---
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(l)] for l in lifestage_order]
ax.legend(sorted_handles, lifestage_order, title="Lifestage", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Store with PREIMUM¶

In [75]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Aggregate total sales by store and PREMIUM_CUSTOMER ---
store_premium_sales = (
    df.groupby(["STORE_NBR", "PREMIUM_CUSTOMER"])["TOT_SALES"]
    .sum()
    .reset_index()
)

# --- Find top 10 stores by total sales ---
top_stores = (
    store_premium_sales.groupby("STORE_NBR")["TOT_SALES"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# --- Filter only top 10 stores ---
store_premium_sales = store_premium_sales[
    store_premium_sales["STORE_NBR"].isin(top_stores)
]

# Pivot for stacked bar chart
pivot_data = store_premium_sales.pivot(
    index="STORE_NBR",
    columns="PREMIUM_CUSTOMER",
    values="TOT_SALES"
).fillna(0)

# Sort stores by total sales
pivot_data = pivot_data.loc[
    pivot_data.sum(axis=1).sort_values(ascending=False).index
]

# --- Sort PREMIUM_CUSTOMER (columns) by total sales contribution ---
premium_order = pivot_data.sum(axis=0).sort_values(ascending=False).index
pivot_data = pivot_data[premium_order]

# --- Plot ---
ax = pivot_data.plot(
    kind="bar",
    stacked=True,
    rot=0,
    figsize=(14, 7),
    color=["#3498db", "#2ecc71", "#e74c3c"]  # Mainstream, Budget, Premium
)

plt.title("Top 10 Stores by Sales (Stacked by Premium Customer)", fontsize=14, fontweight="bold")
plt.xlabel("Store Number")
plt.ylabel("Total Sales ($)")
plt.xticks(rotation=0, ha="right")

# --- Add percentage labels inside bars ---
for i, row in enumerate(pivot_data.values):
    total = row.sum()
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 0:
            percent = (val / total) * 100
            if percent > 5:  # Show only if >5% for clarity
                ax.text(
                    i,
                    cum_sum + val / 2,
                    f"{percent:.1f}%",
                    ha="center",
                    va="center",
                    fontsize=8,
                    fontweight="bold",
                    color="white"
                )
        cum_sum += val

# --- Custom sorted legend ---
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(l)] for l in premium_order]
ax.legend(sorted_handles, premium_order, title="Premium Customer", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Store with brand¶

In [76]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Aggregate total sales by store and Brand ---
store_brand_sales = (
    df.groupby(["STORE_NBR", "Brand"])["TOT_SALES"]
    .sum()
    .reset_index()
)

# --- Find top 10 stores by total sales ---
top_stores = (
    store_brand_sales.groupby("STORE_NBR")["TOT_SALES"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# --- Filter only top 10 stores ---
store_brand_sales = store_brand_sales[
    store_brand_sales["STORE_NBR"].isin(top_stores)
]

# Pivot for stacked bar chart
pivot_data = store_brand_sales.pivot(
    index="STORE_NBR",
    columns="Brand",
    values="TOT_SALES"
).fillna(0)

# Sort stores by total sales
pivot_data = pivot_data.loc[
    pivot_data.sum(axis=1).sort_values(ascending=False).index
]

# --- Sort brands (columns) by total sales contribution ---
brand_order = pivot_data.sum(axis=0).sort_values(ascending=False).index
pivot_data = pivot_data[brand_order]

# --- Plot ---
ax = pivot_data.plot(
    kind="bar",
    stacked=True,
    rot=0,
    figsize=(16, 8),
    colormap="tab20"   # More colors for many brands
)

plt.title("Top 10 Stores by Sales (Stacked by Brand)", fontsize=14, fontweight="bold")
plt.xlabel("Store Number")
plt.ylabel("Total Sales ($)")
plt.xticks(rotation=0, ha="right")

# --- Add percentage labels inside bars ---
for i, row in enumerate(pivot_data.values):
    total = row.sum()
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 0:
            percent = (val / total) * 100
            if percent > 5:  # Show only if >5% for clarity
                ax.text(
                    i,
                    cum_sum + val / 2,
                    f"{percent:.1f}%",
                    ha="center",
                    va="center",
                    fontsize=8,
                    fontweight="bold",
                    color="white"
                )
        cum_sum += val

# --- Custom sorted legend ---
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(l)] for l in brand_order]
ax.legend(sorted_handles, brand_order, title="Brand", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Trail Store with LIFESTAGE¶

In [77]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Aggregate total sales by store and lifestage ---
store_lifestage_sales = (
    df.groupby(["STORE_NBR", "LIFESTAGE"])["TOT_SALES"]
    .sum()
    .reset_index()
)

# --- Define trial stores ---
trial_stores = [88, 86, 77]

# --- Filter only trial stores ---
store_lifestage_sales = store_lifestage_sales[
    store_lifestage_sales["STORE_NBR"].isin(trial_stores)
]

# Pivot for stacked bar chart
pivot_data = store_lifestage_sales.pivot(
    index="STORE_NBR",
    columns="LIFESTAGE",
    values="TOT_SALES"
).fillna(0)

# Sort stores by total sales
pivot_data = pivot_data.loc[
    pivot_data.sum(axis=1).sort_values(ascending=False).index
]

# --- Sort Lifestage (columns) by total sales contribution ---
lifestage_order = pivot_data.sum(axis=0).sort_values(ascending=False).index
pivot_data = pivot_data[lifestage_order]

# --- Plot ---
ax = pivot_data.plot(
    kind="bar",
    stacked=True,
    rot=0,
    figsize=(12, 6),
    colormap="tab20"
)

plt.title("Trial Stores by Sales (Stacked by Lifestage)", fontsize=14, fontweight="bold")
plt.xlabel("Store Number")
plt.ylabel("Total Sales ($)")
plt.xticks(rotation=0, ha="right")

# --- Add percentage labels inside bars ---
for i, row in enumerate(pivot_data.values):
    total = row.sum()
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 0:
            percent = (val / total) * 100
            if percent > 5:  # Show only if >5% for clarity
                ax.text(
                    i,
                    cum_sum + val / 2,
                    f"{percent:.1f}%",
                    ha="center",
                    va="center",
                    fontsize=8,
                    fontweight="bold",
                    color="white"
                )
        cum_sum += val

# --- Custom sorted legend ---
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(l)] for l in lifestage_order]
ax.legend(sorted_handles, lifestage_order, title="Lifestage", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Trail Store with PREIMIUM¶

In [78]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Aggregate total sales by store and premium customer ---
store_premium_sales = (
    df.groupby(["STORE_NBR", "PREMIUM_CUSTOMER"])["TOT_SALES"]
    .sum()
    .reset_index()
)

# --- Define trial stores ---
trial_stores = [88, 86, 77]

# --- Filter only trial stores ---
store_premium_sales = store_premium_sales[
    store_premium_sales["STORE_NBR"].isin(trial_stores)
]

# Pivot for stacked bar chart
pivot_data = store_premium_sales.pivot(
    index="STORE_NBR",
    columns="PREMIUM_CUSTOMER",
    values="TOT_SALES"
).fillna(0)

# Sort stores by total sales
pivot_data = pivot_data.loc[
    pivot_data.sum(axis=1).sort_values(ascending=False).index
]

# --- Sort Premium types (columns) by total sales contribution ---
premium_order = pivot_data.sum(axis=0).sort_values(ascending=False).index
pivot_data = pivot_data[premium_order]

# --- Plot ---
ax = pivot_data.plot(
    kind="bar",
    stacked=True,
    rot=0,
    figsize=(12, 6),
    color=["#3498db", "#2ecc71", "#e74c3c"]  # Mainstream, Budget, Premium
)

plt.title("Trial Stores by Sales (Stacked by Premium Type)", fontsize=14, fontweight="bold")
plt.xlabel("Store Number")
plt.ylabel("Total Sales ($)")
plt.xticks(rotation=0, ha="right")

# --- Add percentage labels inside bars ---
for i, row in enumerate(pivot_data.values):
    total = row.sum()
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 0:
            percent = (val / total) * 100
            if percent > 5:  # Show only if >5% for clarity
                ax.text(
                    i,
                    cum_sum + val / 2,
                    f"{percent:.1f}%",
                    ha="center",
                    va="center",
                    fontsize=8,
                    fontweight="bold",
                    color="white"
                )
        cum_sum += val

# --- Custom sorted legend ---
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(l)] for l in premium_order]
ax.legend(sorted_handles, premium_order, title="Premium Type", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Trail Store with Brand¶

In [79]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Aggregate total sales by store and brand ---
store_brand_sales = (
    df.groupby(["STORE_NBR", "Brand"])["TOT_SALES"]
    .sum()
    .reset_index()
)

# --- Define trial stores ---
trial_stores = [88, 86, 77]

# --- Filter only trial stores ---
store_brand_sales = store_brand_sales[
    store_brand_sales["STORE_NBR"].isin(trial_stores)
]

# Pivot for stacked bar chart
pivot_data = store_brand_sales.pivot(
    index="STORE_NBR",
    columns="Brand",
    values="TOT_SALES"
).fillna(0)

# Sort stores by total sales
pivot_data = pivot_data.loc[
    pivot_data.sum(axis=1).sort_values(ascending=False).index
]

# --- Sort brands (columns) by total sales contribution ---
brand_order = pivot_data.sum(axis=0).sort_values(ascending=False).index
pivot_data = pivot_data[brand_order]

# --- Plot ---
ax = pivot_data.plot(
    kind="bar",
    stacked=True,
    rot=0,
    figsize=(14, 7),
    colormap="tab20"  # large palette for many brands
)

plt.title("Trial Stores by Sales (Stacked by Brand)", fontsize=14, fontweight="bold")
plt.xlabel("Store Number")
plt.ylabel("Total Sales ($)")
plt.xticks(rotation=0, ha="right")

# --- Add percentage labels inside bars ---
for i, row in enumerate(pivot_data.values):
    total = row.sum()
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 0:
            percent = (val / total) * 100
            if percent > 5:  # Show only if >5% for clarity
                ax.text(
                    i,
                    cum_sum + val / 2,
                    f"{percent:.1f}%",
                    ha="center",
                    va="center",
                    fontsize=8,
                    fontweight="bold",
                    color="white"
                )
        cum_sum += val

# --- Custom sorted legend ---
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(l)] for l in brand_order]
ax.legend(sorted_handles, brand_order, title="Brand", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Sales

In [80]:
# -----------------------------
# 5. Time-based Analysis
# -----------------------------

import matplotlib.pyplot as plt
import seaborn as sns

# Apply Seaborn style
sns.set_style("whitegrid")

# Convert DATE to datetime
df['DATE'] = pd.to_datetime(df['DATE'])

# Monthly revenue
monthly_sales = df.groupby(pd.Grouper(key="DATE", freq="M"))["TOT_SALES"].sum()

plt.figure(figsize=(12,5))
ax = monthly_sales.plot(
    marker="o", 
    markersize=8, 
    linewidth=2, 
    color="royalblue", 
    alpha=0.8
)

plt.title("Monthly Sales Trend", fontsize=14, fontweight="bold")
plt.ylabel("Total Sales", fontsize=12)
plt.xlabel("Month", fontsize=12)

# Detect highest and lowest month
highest_month = monthly_sales.idxmax()
lowest_month = monthly_sales.idxmin()
highest_value = monthly_sales.max()
lowest_value = monthly_sales.min()

# Highlight highest month
plt.scatter(highest_month, highest_value, color="green", s=100, zorder=5, label="Highest")
ax.annotate(f"High: $ {highest_value:,.0f}",
            xy=(highest_month, highest_value),
            xytext=(highest_month, highest_value + 0.002 * monthly_sales.max()),
            ha="center", fontsize=10, fontweight="bold",
            arrowprops=dict(facecolor="green", arrowstyle="->"))

# Highlight lowest month
plt.scatter(lowest_month, lowest_value, color="red", s=100, zorder=5, label="Lowest")
ax.annotate(f"Low: $ {lowest_value:,.0f}",
            xy=(lowest_month, lowest_value),
            xytext=(lowest_month, lowest_value - 0.005 * monthly_sales.max()),
            ha="center", fontsize=10, fontweight="bold",
            arrowprops=dict(facecolor="red", arrowstyle="->"))


# Add average monthly sales line
avg_sales = monthly_sales.mean()
ax.axhline(avg_sales, color="black", linestyle="--", linewidth=1.5, label=f"Average: $ {avg_sales:,.0f}")


# Rotate x-axis labels for better visibility
plt.xticks(rotation=45)

# Add legend
plt.legend()

plt.show()

# -----------------------------
# Highest & Lowest Day Analysis
# -----------------------------

# Daily sales
daily_sales = df.groupby("DATE")["TOT_SALES"].sum()

# Calculate average sales
avg_sales = daily_sales.mean()

# Get highest and lowest
highest_day = daily_sales.idxmax()
lowest_day = daily_sales.idxmin()
highest_day_value = daily_sales.max()
lowest_day_value = daily_sales.min()

# Make sure they are datetime objects
highest_day = pd.to_datetime(highest_day)
lowest_day = pd.to_datetime(lowest_day)

# Plot
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily_sales.index, daily_sales.values, marker="o", linewidth=1.2, alpha=0.8)

# Highlight highest
ax.scatter(highest_day, highest_day_value, color="green", s=120, zorder=5, label="Highest")
ax.annotate(f"High: $ {highest_day_value:,.0f}\n{highest_day.strftime('%Y-%m-%d')}",
            xy=(highest_day, highest_day_value),
            # xytext=(highest_day, highest_day_value + 0.008 * daily_sales.max()),
            xytext=(highest_day + pd.Timedelta(days=20), highest_day_value),
            ha="center", fontsize=10, fontweight="bold",
            arrowprops=dict(facecolor="green", arrowstyle="->"))

# Highlight lowest
ax.scatter(lowest_day, lowest_day_value, color="red", s=120, zorder=5, label="Lowest")
ax.annotate(f"Low: $ {lowest_day_value:,.0f}\n{lowest_day.strftime('%Y-%m-%d')}",
            xy=(lowest_day, lowest_day_value),
            # xytext=(lowest_day, lowest_day_value + 0.008 * daily_sales.max()),
            xytext=(lowest_day + pd.Timedelta(days=20), lowest_day_value),  # shift 10 days to the right
            ha="center", fontsize=10, fontweight="bold",
            arrowprops=dict(facecolor="red", arrowstyle="->"))


# Add average line
ax.axhline(avg_sales, color="black", linestyle="--", linewidth=1.5, label=f"Average: $ {avg_sales:,.0f}")


ax.set_title("Daily Sales Trend with Highest & Lowest Days", fontsize=14, fontweight="bold", pad=20)
ax.set_xlabel("Date")
ax.set_ylabel("Total Sales")
ax.legend()
plt.show()
C:\Users\DELL\AppData\Local\Temp\ipykernel_22016\1000782018.py:15: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  monthly_sales = df.groupby(pd.Grouper(key="DATE", freq="M"))["TOT_SALES"].sum()
No description has been provided for this image
No description has been provided for this image
In [ ]:
 

Sales over LIFESTAGE¶

In [81]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Ensure DATE is datetime ---
df["DATE"] = pd.to_datetime(df["DATE"])

# --- Aggregate sales over time by LIFESTAGE ---
sales_time_lifestage = (
    df.groupby([pd.Grouper(key="DATE", freq="M"), "LIFESTAGE"])["TOT_SALES"]
    .sum()
    .reset_index()
)

# --- Pivot for plotting ---
pivot_data = sales_time_lifestage.pivot(
    index="DATE", 
    columns="LIFESTAGE", 
    values="TOT_SALES"
).fillna(0)

# --- Sort legend by total sales ---
totals = pivot_data.sum().sort_values(ascending=False)
pivot_data = pivot_data[totals.index]

# --- Plot ---
plt.figure(figsize=(14, 7))
ax = pivot_data.plot(ax=plt.gca(), linewidth=2, marker="o")

plt.title("Monthly Sales Trend by Lifestage", fontsize=14, fontweight="bold")
plt.xlabel("Date")
plt.ylabel("Total Sales ($)")
plt.grid(alpha=0.3)

# --- Custom sorted legend ---
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(l)] for l in totals.index]
ax.legend(sorted_handles, totals.index, title="Lifestage", bbox_to_anchor=(1.05, 1), loc="upper left")

# --- Add data labels ---
for line in ax.get_lines():
    for x, y in zip(line.get_xdata(), line.get_ydata()):
        if y > 0:  # avoid labeling empty points
            ax.text(
                x, y, f"${y:,.0f}", 
                ha="center", va="bottom", fontsize=8, fontweight="bold", rotation=45
            )

plt.tight_layout()
plt.show()
C:\Users\DELL\AppData\Local\Temp\ipykernel_22016\1234985238.py:9: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  df.groupby([pd.Grouper(key="DATE", freq="M"), "LIFESTAGE"])["TOT_SALES"]
No description has been provided for this image
In [ ]:
 

Sales Over PREIMIUM¶

In [82]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Ensure DATE is datetime ---
df["DATE"] = pd.to_datetime(df["DATE"])

# --- Aggregate sales over time by PREMIUM_CUSTOMER ---
sales_time_premium = (
    df.groupby([pd.Grouper(key="DATE", freq="M"), "PREMIUM_CUSTOMER"])["TOT_SALES"]
    .sum()
    .reset_index()
)

# --- Pivot for plotting ---
pivot_data = sales_time_premium.pivot(
    index="DATE", 
    columns="PREMIUM_CUSTOMER", 
    values="TOT_SALES"
).fillna(0)

# --- Sort legend by total sales ---
totals = pivot_data.sum().sort_values(ascending=False)
pivot_data = pivot_data[totals.index]

# --- Plot ---
plt.figure(figsize=(14, 7))
ax = pivot_data.plot(ax=plt.gca(), linewidth=2, marker="o")

plt.title("Monthly Sales Trend by Premium Customer Type", fontsize=14, fontweight="bold")
plt.xlabel("Date")
plt.ylabel("Total Sales ($)")
plt.grid(alpha=0.3)

# Custom sorted legend
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(l)] for l in totals.index]
ax.legend(sorted_handles, totals.index, title="Premium Customer", bbox_to_anchor=(1.05, 1), loc="upper left")

# --- Add data labels ---
for line in ax.get_lines():
    for x, y in zip(line.get_xdata(), line.get_ydata()):
        if y > 0:  # avoid labeling empty points
            ax.text(
                x, y, f"${y:,.0f}", 
                ha="center", va="bottom", fontsize=8, fontweight="bold", rotation=45
            )

plt.tight_layout()
plt.show()
C:\Users\DELL\AppData\Local\Temp\ipykernel_22016\1787893678.py:9: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  df.groupby([pd.Grouper(key="DATE", freq="M"), "PREMIUM_CUSTOMER"])["TOT_SALES"]
No description has been provided for this image
In [ ]:
 
In [ ]:
 

Pack Size

In [83]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Aggregate sales by Pack Size ---
pack_sales = (
    df.groupby("Pack_Size (g)")["TOT_SALES"]
    .sum()
    .sort_index()
).sort_values(ascending=False)

# --- Plot ---
plt.figure(figsize=(12, 6))
ax = pack_sales.plot(kind="bar", color="skyblue", edgecolor="black")

plt.title("Total Sales by Pack Size", fontsize=14, fontweight="bold")
plt.xlabel("Pack Size (g)")
plt.ylabel("Total Sales ($)")
plt.xticks(rotation=0, ha="right")

# Add labels on bars
for i, v in enumerate(pack_sales.values):
    ax.text(i, v + 0.01 * pack_sales.max(), f"${v:,.0f}",
            ha="center", va="bottom", fontsize=9, fontweight="bold")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Pack Size Over PREIMIUM¶

In [84]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Aggregate sales by Pack Size and Premium Customer ---
pack_premium_sales = (
    df.groupby(["Pack_Size (g)", "PREMIUM_CUSTOMER"])["TOT_SALES"]
    .sum()
    .reset_index()
)

# Pivot for stacked bar
pivot_data = pack_premium_sales.pivot(
    index="Pack_Size (g)",
    columns="PREMIUM_CUSTOMER",
    values="TOT_SALES"
).fillna(0)

# --- Sort Pack Sizes by total sales (descending) ---
pivot_data = pivot_data.loc[pivot_data.sum(axis=1).sort_values(ascending=False).index]

# --- Sort Premium Customer groups by total sales (descending) ---
premium_order = pivot_data.sum(axis=0).sort_values(ascending=False).index
pivot_data = pivot_data[premium_order]

# --- Convert to percentages (100% stacked) ---
pivot_percent = pivot_data.div(pivot_data.sum(axis=1), axis=0) * 100

# --- Plot ---
ax = pivot_percent.plot(
    kind="bar",
    stacked=True,
    figsize=(14, 7),
    colormap="tab10",
    edgecolor="black"
)

plt.title("100% Stacked Sales by Pack Size and Premium Customer", fontsize=14, fontweight="bold")
plt.xlabel("Pack Size (g)")
plt.ylabel("Percentage of Sales (%)")
plt.xticks(rotation=45, ha="right")

# Add percentage labels inside bars
for i, row in enumerate(pivot_percent.values):
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 5:  # Show only if >5% for clarity
            ax.text(
                i,
                cum_sum + val / 2,
                f"{val:.1f}%",
                ha="center",
                va="center",
                fontsize=9,
                fontweight="bold",
                color="black"
            )
        cum_sum += val

# --- Custom sorted legend ---
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(l)] for l in premium_order]
ax.legend(sorted_handles, premium_order, title="Premium Customer", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Pack Size Over LIFESTAGE¶

In [85]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Prepare data ---
pack_sales = df.groupby(["Pack_Size (g)", "LIFESTAGE"])["TOT_SALES"].sum().reset_index()

# Pivot for stacked bar
pack_sales_pivot = pack_sales.pivot(
    index="Pack_Size (g)",
    columns="LIFESTAGE",
    values="TOT_SALES"
).fillna(0)

# Sort pack sizes by total sales (descending)
pack_sales_pivot = pack_sales_pivot.loc[
    pack_sales_pivot.sum(axis=1).sort_values(ascending=False).index
]

# Sort Lifestages (columns) by overall contribution (descending)
lifestage_order = pack_sales_pivot.sum(axis=0).sort_values(ascending=False).index
pack_sales_pivot = pack_sales_pivot[lifestage_order]

# Convert to percentages (100% stacked)
pack_sales_percent = pack_sales_pivot.div(pack_sales_pivot.sum(axis=1), axis=0) * 100

# --- Plot ---
ax = pack_sales_percent.plot(
    kind="bar",
    stacked=True,
    figsize=(14, 7),
    colormap="tab20"
)

plt.title("100% Stacked Sales by Pack Size and Lifestage", fontsize=14, fontweight="bold")
plt.xlabel("Pack Size (g)")
plt.ylabel("Percentage of Sales (%)")
plt.xticks(rotation=45, ha="right")

# Add percentage labels inside bars
for i, row in enumerate(pack_sales_percent.values):
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 5:  # Only label segments >5% for clarity
            ax.text(
                i,
                cum_sum + val / 2,
                f"{val:.1f}%",
                ha="center",
                va="center",
                fontsize=8,
                fontweight="bold",
                color="black"
            )
        cum_sum += val

# --- Sorted legend ---
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(l)] for l in lifestage_order]
ax.legend(sorted_handles, lifestage_order, title="Lifestage", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Pack Size Over Brand¶

In [86]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Prepare data ---
pack_sales = df.groupby(["Pack_Size (g)", "Brand"])["TOT_SALES"].sum().reset_index()

# Pivot for stacked bar
pack_sales_pivot = pack_sales.pivot(
    index="Pack_Size (g)",
    columns="Brand",
    values="TOT_SALES"
).fillna(0)

# Sort pack sizes by total sales (descending)
pack_sales_pivot = pack_sales_pivot.loc[
    pack_sales_pivot.sum(axis=1).sort_values(ascending=False).index
]

# Sort brands (columns) by overall contribution (descending)
brand_order = pack_sales_pivot.sum(axis=0).sort_values(ascending=False).index
pack_sales_pivot = pack_sales_pivot[brand_order]

# Convert to percentages (100% stacked)
pack_sales_percent = pack_sales_pivot.div(pack_sales_pivot.sum(axis=1), axis=0) * 100

# --- Plot ---
ax = pack_sales_percent.plot(
    kind="bar",
    stacked=True,
    figsize=(14, 7),
    colormap="tab20"
)

plt.title("100% Stacked Sales by Pack Size and Brand", fontsize=14, fontweight="bold")
plt.xlabel("Pack Size (g)")
plt.ylabel("Percentage of Sales (%)")
plt.xticks(rotation=45, ha="right")

# Add percentage labels inside bars
for i, row in enumerate(pack_sales_percent.values):
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 5:  # Only label segments >5% for clarity
            ax.text(
                i,
                cum_sum + val / 2,
                f"{val:.1f}%",
                ha="center",
                va="center",
                fontsize=8,
                fontweight="bold",
                color="black"
            )
        cum_sum += val

# --- Sorted legend ---
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(b)] for b in brand_order]
ax.legend(sorted_handles, brand_order, title="Brand", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Pack Size Over Store¶

In [87]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Prepare data ---
pack_sales = df.groupby(["Pack_Size (g)", "STORE_NBR"])["TOT_SALES"].sum().reset_index()

# Find top 10 stores by total sales
top_stores = (
    pack_sales.groupby("STORE_NBR")["TOT_SALES"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# Filter only top 10 stores
pack_sales = pack_sales[pack_sales["STORE_NBR"].isin(top_stores)]

# Pivot for stacked bar
pack_sales_pivot = pack_sales.pivot(
    index="Pack_Size (g)",
    columns="STORE_NBR",
    values="TOT_SALES"
).fillna(0)

# Sort pack sizes by total sales (descending)
pack_sales_pivot = pack_sales_pivot.loc[
    pack_sales_pivot.sum(axis=1).sort_values(ascending=False).index
]

# Sort stores (columns) by overall contribution (descending)
store_order = pack_sales_pivot.sum(axis=0).sort_values(ascending=False).index
pack_sales_pivot = pack_sales_pivot[store_order]

# Convert to percentages (100% stacked)
pack_sales_percent = pack_sales_pivot.div(pack_sales_pivot.sum(axis=1), axis=0) * 100

# --- Plot ---
ax = pack_sales_percent.plot(
    kind="bar",
    stacked=True,
    figsize=(14, 7),
    colormap="tab10"
)

plt.title("100% Stacked Sales by Pack Size and Top 10 Stores", fontsize=14, fontweight="bold")
plt.xlabel("Pack Size (g)")
plt.ylabel("Percentage of Sales (%)")
plt.xticks(rotation=45, ha="right")

# Add percentage labels inside bars
for i, row in enumerate(pack_sales_percent.values):
    cum_sum = 0
    for j, val in enumerate(row):
        if val > 5:  # Only label segments >5% for clarity
            ax.text(
                i,
                cum_sum + val / 2,
                f"{val:.1f}%",
                ha="center",
                va="center",
                fontsize=8,
                fontweight="bold",
                color="white"
            )
        cum_sum += val

# --- Sorted legend ---
handles, labels = ax.get_legend_handles_labels()
sorted_handles = [handles[labels.index(str(s))] for s in store_order]
ax.legend(sorted_handles, store_order, title="Top 10 Stores", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]: